eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ucd.h 40KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003
  1. /* Unicode Character Database API
  2. *
  3. * Copyright (C) 2012-2017 Reece H. Dunn
  4. *
  5. * This file is part of ucd-tools.
  6. *
  7. * ucd-tools is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * ucd-tools is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifndef UNICODE_CHARACTER_DATA_H
  21. #define UNICODE_CHARACTER_DATA_H
  22. #include <stdint.h>
  23. #ifdef __cplusplus
  24. extern "C"
  25. {
  26. #endif
  27. /** @brief Represents a Unicode codepoint.
  28. */
  29. typedef uint32_t codepoint_t;
  30. /** @brief Unicode General Category Groups
  31. * @see http://www.unicode.org/reports/tr44/
  32. */
  33. typedef enum ucd_category_group_
  34. {
  35. UCD_CATEGORY_GROUP_C, /**< @brief Other */
  36. UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  37. UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  38. UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  39. UCD_CATEGORY_GROUP_N, /**< @brief Number */
  40. UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  41. UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  42. UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  43. } ucd_category_group;
  44. /** @brief Get a string representation of the category_group enumeration value.
  45. *
  46. * @param c The value to get the string representation for.
  47. *
  48. * @return The string representation, or "-" if the value is not recognized.
  49. */
  50. const char *ucd_get_category_group_string(ucd_category_group c);
  51. /** @brief Unicode General Category Values
  52. * @see http://www.unicode.org/reports/tr44/
  53. */
  54. typedef enum ucd_category_
  55. {
  56. UCD_CATEGORY_Cc, /**< @brief Control Character */
  57. UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  58. UCD_CATEGORY_Cn, /**< @brief Unassigned */
  59. UCD_CATEGORY_Co, /**< @brief Private Use */
  60. UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  61. UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  62. UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  63. UCD_CATEGORY_Lm, /**< @brief Modifier Letter */
  64. UCD_CATEGORY_Lo, /**< @brief Other Letter */
  65. UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  66. UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  67. UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  68. UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  69. UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  70. UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  71. UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  72. UCD_CATEGORY_No, /**< @brief Other Number */
  73. UCD_CATEGORY_Pc, /**< @brief Connector Punctuation Mark */
  74. UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  75. UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  76. UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  77. UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  78. UCD_CATEGORY_Po, /**< @brief Other Punctuation Mark */
  79. UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  80. UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  81. UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  82. UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  83. UCD_CATEGORY_So, /**< @brief Other Symbol */
  84. UCD_CATEGORY_Zl, /**< @brief Line Separator */
  85. UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  86. UCD_CATEGORY_Zs, /**< @brief Space Separator */
  87. } ucd_category;
  88. /** @brief Get a string representation of the category enumeration value.
  89. *
  90. * @param c The value to get the string representation for.
  91. *
  92. * @return The string representation, or "--" if the value is not recognized.
  93. */
  94. const char *ucd_get_category_string(ucd_category c);
  95. /** @brief Lookup the General Category Group for a General Category.
  96. *
  97. * @param c The General Category to lookup.
  98. * @return The General Category Group of the General Category.
  99. */
  100. ucd_category_group ucd_get_category_group_for_category(ucd_category c);
  101. /** @brief Lookup the General Category Group for a Unicode codepoint.
  102. *
  103. * @param c The Unicode codepoint to lookup.
  104. * @return The General Category Group of the Unicode codepoint.
  105. */
  106. ucd_category_group ucd_lookup_category_group(codepoint_t c);
  107. /** @brief Lookup the General Category for a Unicode codepoint.
  108. *
  109. * @param c The Unicode codepoint to lookup.
  110. * @return The General Category of the Unicode codepoint.
  111. */
  112. ucd_category ucd_lookup_category(codepoint_t c);
  113. /** @brief Unicode Script
  114. * @see http://www.iana.org/assignments/language-subtag-registry
  115. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  116. */
  117. typedef enum ucd_script_
  118. {
  119. UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
  120. UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  121. UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  122. UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  123. UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  124. UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  125. UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  126. UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  127. UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  128. UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  129. UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  130. UCD_SCRIPT_Batk, /**< @brief Batak Script */
  131. UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  132. UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
  133. UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  134. UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  135. UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  136. UCD_SCRIPT_Brai, /**< @brief Braille Script */
  137. UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  138. UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  139. UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  140. UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  141. UCD_SCRIPT_Cari, /**< @brief Carian Script */
  142. UCD_SCRIPT_Cham, /**< @brief Cham Script */
  143. UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  144. UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  145. UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  146. UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  147. UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  148. UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  149. UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  150. UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  151. UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  152. UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  153. UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  154. UCD_SCRIPT_Egyp, /**< @brief Egyptian Hieroglyphs */
  155. UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  156. UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  157. UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  158. UCD_SCRIPT_Geor, /**< @brief Georgian Script */
  159. UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  160. UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  161. UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  162. UCD_SCRIPT_Grek, /**< @brief Greek Script */
  163. UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  164. UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  165. UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  166. UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  167. UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  168. UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  169. UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  170. UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  171. UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  172. UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  173. UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  174. UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  175. UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  176. UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  177. UCD_SCRIPT_Inds, /**< @brief Indus Script */
  178. UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  179. UCD_SCRIPT_Java, /**< @brief Javanese Script */
  180. UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  181. UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  182. UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  183. UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  184. UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  185. UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  186. UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  187. UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  188. UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  189. UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  190. UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  191. UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  192. UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  193. UCD_SCRIPT_Latf, /**< @brief Latin Script (Fraktur Variant) */
  194. UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  195. UCD_SCRIPT_Latn, /**< @brief Latin Script */
  196. UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  197. UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  198. UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  199. UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  200. UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  201. UCD_SCRIPT_Loma, /**< @brief Loma Script */
  202. UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  203. UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  204. UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  205. UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  206. UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  207. UCD_SCRIPT_Marc, /**< @brief Marchen Script */
  208. UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  209. UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  210. UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  211. UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  212. UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  213. UCD_SCRIPT_Modi, /**< @brief Modi Script */
  214. UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  215. UCD_SCRIPT_Moon, /**< @brief Moon Script */
  216. UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  217. UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  218. UCD_SCRIPT_Mult, /**< @brief Multani Script */
  219. UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  220. UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  221. UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  222. UCD_SCRIPT_Newa, /**< @brief Newa Script */
  223. UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  224. UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  225. UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  226. UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  227. UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  228. UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  229. UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  230. UCD_SCRIPT_Osge, /**< @brief Osage Script */
  231. UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  232. UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  233. UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  234. UCD_SCRIPT_Perm, /**< @brief Old Permic */
  235. UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  236. UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  237. UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  238. UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  239. UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  240. UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  241. UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  242. UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  243. UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  244. UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  245. UCD_SCRIPT_Runr, /**< @brief Runic Script */
  246. UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  247. UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  248. UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  249. UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  250. UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  251. UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  252. UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  253. UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  254. UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  255. UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  256. UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  257. UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  258. UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  259. UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  260. UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  261. UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  262. UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  263. UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  264. UCD_SCRIPT_Takr, /**< @brief Takri Script */
  265. UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  266. UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  267. UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  268. UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  269. UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  270. UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  271. UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  272. UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  273. UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  274. UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  275. UCD_SCRIPT_Thai, /**< @brief Thai Script */
  276. UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  277. UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  278. UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  279. UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  280. UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  281. UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  282. UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  283. UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  284. UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  285. UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  286. UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  287. UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  288. UCD_SCRIPT_Zsym, /**< @brief Symbols */
  289. UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  290. UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  291. UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  292. } ucd_script;
  293. /** @brief Get a string representation of the script enumeration value.
  294. *
  295. * @param s The value to get the string representation for.
  296. *
  297. * @return The string representation, or "----" if the value is not recognized.
  298. */
  299. const char *ucd_get_script_string(ucd_script s);
  300. /** @brief Lookup the Script for a Unicode codepoint.
  301. *
  302. * @param c The Unicode codepoint to lookup.
  303. * @return The Script of the Unicode codepoint.
  304. */
  305. ucd_script ucd_lookup_script(codepoint_t c);
  306. /** @brief Codepoint property flags (each enumerator is a distinct single-bit value).
  307. */
  308. typedef enum ucd_property_
  309. {
  310. UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space PropList */
  311. UCD_PROPERTY_NO_BREAK = 0x00000002, /**< @brief <noBreak> DispositionType (enabled check only) */
  312. UCD_PROPERTY_BIDI_CONTROL = 0x00000004, /**< @brief Bidi_Control PropList */
  313. UCD_PROPERTY_JOIN_CONTROL = 0x00000008, /**< @brief Join_Control PropList */
  314. UCD_PROPERTY_DASH = 0x00000010, /**< @brief Dash PropList */
  315. UCD_PROPERTY_HYPHEN = 0x00000020, /**< @brief Hyphen PropList */
  316. UCD_PROPERTY_QUOTATION_MARK = 0x00000040, /**< @brief Quotation_Mark PropList */
  317. UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000080, /**< @brief Terminal_Punctuation PropList */
  318. UCD_PROPERTY_OTHER_MATH = 0x00000100, /**< @brief Other_Math PropList */
  319. UCD_PROPERTY_HEX_DIGIT = 0x00000200, /**< @brief Hex_Digit PropList */
  320. UCD_PROPERTY_ASCII_HEX_DIGIT = 0x00000400, /**< @brief ASCII_Hex_Digit PropList */
  321. UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
  322. UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
  323. UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
  324. UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
  325. UCD_PROPERTY_OTHER_LOWERCASE = 0x00008000, /**< @brief Other_Lowercase PropList */
  326. } ucd_property;
  327. /** @brief Return the properties of the specified codepoint.
  328. *
  329. * @param c The Unicode codepoint to lookup.
  330. * @param category The General Category of the codepoint.
  331. * @return The properties associated with the codepoint.
  332. */
  333. ucd_property ucd_properties(codepoint_t c, ucd_category category);
  334. /** @brief Is the codepoint in the 'alnum' class?
  335. *
  336. * @param c The Unicode codepoint to check.
  337. * @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
  338. */
  339. int ucd_isalnum(codepoint_t c);
  340. /** @brief Is the codepoint in the 'alpha' class?
  341. *
  342. * @param c The Unicode codepoint to check.
  343. * @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
  344. */
  345. int ucd_isalpha(codepoint_t c);
  346. /** @brief Is the codepoint in the 'blank' class?
  347. *
  348. * @param c The Unicode codepoint to check.
  349. * @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
  350. */
  351. int ucd_isblank(codepoint_t c);
  352. /** @brief Is the codepoint in the 'cntrl' class?
  353. *
  354. * @param c The Unicode codepoint to check.
  355. * @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
  356. */
  357. int ucd_iscntrl(codepoint_t c);
  358. /** @brief Is the codepoint in the 'digit' class?
  359. *
  360. * @param c The Unicode codepoint to check.
  361. * @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
  362. */
  363. int ucd_isdigit(codepoint_t c);
  364. /** @brief Is the codepoint in the 'graph' class?
  365. *
  366. * @param c The Unicode codepoint to check.
  367. * @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
  368. */
  369. int ucd_isgraph(codepoint_t c);
  370. /** @brief Is the codepoint in the 'lower' class?
  371. *
  372. * @param c The Unicode codepoint to check.
  373. * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
  374. */
  375. int ucd_islower(codepoint_t c);
  376. /** @brief Is the codepoint in the 'print' class?
  377. *
  378. * @param c The Unicode codepoint to check.
  379. * @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
  380. */
  381. int ucd_isprint(codepoint_t c);
  382. /** @brief Is the codepoint in the 'punct' class?
  383. *
  384. * @param c The Unicode codepoint to check.
  385. * @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
  386. */
  387. int ucd_ispunct(codepoint_t c);
  388. /** @brief Is the codepoint in the 'space' class?
  389. *
  390. * @param c The Unicode codepoint to check.
  391. * @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
  392. */
  393. int ucd_isspace(codepoint_t c);
  394. /** @brief Is the codepoint in the 'upper' class?
  395. *
  396. * @param c The Unicode codepoint to check.
  397. * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
  398. */
  399. int ucd_isupper(codepoint_t c);
  400. /** @brief Is the codepoint in the 'xdigit' class?
  401. *
  402. * @param c The Unicode codepoint to check.
  403. * @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
  404. */
  405. int ucd_isxdigit(codepoint_t c);
  406. /** @brief Convert the Unicode codepoint to upper-case.
  407. *
  408. * This function only uses the simple case mapping present in the
  409. * UnicodeData file. The data in SpecialCasing requires Unicode
  410. * codepoints to be mapped to multiple codepoints.
  411. *
  412. * @param c The Unicode codepoint to convert.
  413. * @return The upper-case Unicode codepoint for this codepoint, or
  414. * this codepoint if there is no upper-case codepoint.
  415. */
  416. codepoint_t ucd_toupper(codepoint_t c);
  417. /** @brief Convert the Unicode codepoint to lower-case.
  418. *
  419. * This function only uses the simple case mapping present in the
  420. * UnicodeData file. The data in SpecialCasing requires Unicode
  421. * codepoints to be mapped to multiple codepoints.
  422. *
  423. * @param c The Unicode codepoint to convert.
  424. * @return The lower-case Unicode codepoint for this codepoint, or
  425. * this codepoint if there is no lower-case codepoint.
  426. */
  427. codepoint_t ucd_tolower(codepoint_t c);
  428. /** @brief Convert the Unicode codepoint to title-case.
  429. *
  430. * This function only uses the simple case mapping present in the
  431. * UnicodeData file. The data in SpecialCasing requires Unicode
  432. * codepoints to be mapped to multiple codepoints.
  433. *
  434. * @param c The Unicode codepoint to convert.
  435. * @return The title-case Unicode codepoint for this codepoint, or
  436. * this codepoint if there is no title-case codepoint.
  437. */
  438. codepoint_t ucd_totitle(codepoint_t c);
  439. #ifdef __cplusplus
  440. }
  441. /** @brief Unicode Character Database
  442. */
  443. namespace ucd
  444. {
  445. /** @brief Represents a Unicode codepoint.
  446. */
  447. using ::codepoint_t;
  448. /** @brief Unicode General Category Groups
  449. * @see http://www.unicode.org/reports/tr44/
  450. */
  451. enum category_group
  452. {
  453. C = UCD_CATEGORY_GROUP_C, /**< @brief Other */
  454. I = UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  455. L = UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  456. M = UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  457. N = UCD_CATEGORY_GROUP_N, /**< @brief Number */
  458. P = UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  459. S = UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  460. Z = UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  461. };
  462. /** @brief Get a string representation of the category_group enumeration value.
  463. * C++ wrapper that casts the value and forwards to ucd_get_category_group_string().
  464. * @param c The value to get the string representation for.
  465. *
  466. * @return The string representation, or "-" if the value is not recognized.
  467. */
  468. inline const char *get_category_group_string(category_group c)
  469. {
  470. return ucd_get_category_group_string((ucd_category_group)c);
  471. }
  472. /** @brief Unicode General Category Values
  473. * @see http://www.unicode.org/reports/tr44/
  474. */
  475. enum category
  476. {
  477. Cc = UCD_CATEGORY_Cc, /**< @brief Control Character */
  478. Cf = UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  479. Cn = UCD_CATEGORY_Cn, /**< @brief Unassigned */
  480. Co = UCD_CATEGORY_Co, /**< @brief Private Use */
  481. Cs = UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  482. Ii = UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  483. Ll = UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  484. Lm = UCD_CATEGORY_Lm, /**< @brief Modifier Letter */
  485. Lo = UCD_CATEGORY_Lo, /**< @brief Other Letter */
  486. Lt = UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  487. Lu = UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  488. Mc = UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  489. Me = UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  490. Mn = UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  491. Nd = UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  492. Nl = UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  493. No = UCD_CATEGORY_No, /**< @brief Other Number */
  494. Pc = UCD_CATEGORY_Pc, /**< @brief Connector Punctuation Mark */
  495. Pd = UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  496. Pe = UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  497. Pf = UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  498. Pi = UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  499. Po = UCD_CATEGORY_Po, /**< @brief Other Punctuation Mark */
  500. Ps = UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  501. Sc = UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  502. Sk = UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  503. Sm = UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  504. So = UCD_CATEGORY_So, /**< @brief Other Symbol */
  505. Zl = UCD_CATEGORY_Zl, /**< @brief Line Separator */
  506. Zp = UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  507. Zs = UCD_CATEGORY_Zs, /**< @brief Space Separator */
  508. };
  509. /** @brief Get a string representation of the category enumeration value.
  510. * C++ wrapper that casts the value and forwards to ucd_get_category_string().
  511. * @param c The value to get the string representation for.
  512. *
  513. * @return The string representation, or "--" if the value is not recognized.
  514. */
  515. inline const char *get_category_string(category c)
  516. {
  517. return ucd_get_category_string((ucd_category)c);
  518. }
  519. /** @brief Lookup the General Category Group for a General Category.
  520. * C++ overload that casts and forwards to ucd_get_category_group_for_category().
  521. * @param c The General Category to lookup.
  522. * @return The General Category Group of the General Category.
  523. */
  524. inline category_group lookup_category_group(category c)
  525. {
  526. return (category_group)ucd_get_category_group_for_category((ucd_category)c);
  527. }
  528. /** @brief Lookup the General Category Group for a Unicode codepoint.
  529. * C++ overload that forwards to ucd_lookup_category_group() and casts the result.
  530. * @param c The Unicode codepoint to lookup.
  531. * @return The General Category Group of the Unicode codepoint.
  532. */
  533. inline category_group lookup_category_group(codepoint_t c)
  534. {
  535. return (category_group)ucd_lookup_category_group(c);
  536. }
  537. /** @brief Lookup the General Category for a Unicode codepoint.
  538. * C++ wrapper that forwards to ucd_lookup_category() and casts the result.
  539. * @param c The Unicode codepoint to lookup.
  540. * @return The General Category of the Unicode codepoint.
  541. */
  542. inline category lookup_category(codepoint_t c)
  543. {
  544. return (category)ucd_lookup_category(c);
  545. }
  546. /** @brief Unicode Script
  547. * @see http://www.iana.org/assignments/language-subtag-registry
  548. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  549. */
  550. enum script
  551. {
  552. Adlm = UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
  553. Afak = UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  554. Aghb = UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  555. Ahom = UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  556. Arab = UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  557. Armi = UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  558. Armn = UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  559. Avst = UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  560. Bali = UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  561. Bamu = UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  562. Bass = UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  563. Batk = UCD_SCRIPT_Batk, /**< @brief Batak Script */
  564. Beng = UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  565. Bhks = UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
  566. Blis = UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  567. Bopo = UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  568. Brah = UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  569. Brai = UCD_SCRIPT_Brai, /**< @brief Braille Script */
  570. Bugi = UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  571. Buhd = UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  572. Cakm = UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  573. Cans = UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  574. Cari = UCD_SCRIPT_Cari, /**< @brief Carian Script */
  575. Cham = UCD_SCRIPT_Cham, /**< @brief Cham Script */
  576. Cher = UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  577. Cirt = UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  578. Copt = UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  579. Cprt = UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  580. Cyrl = UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  581. Cyrs = UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  582. Deva = UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  583. Dsrt = UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  584. Dupl = UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  585. Egyd = UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  586. Egyh = UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  587. Egyp = UCD_SCRIPT_Egyp, /**< @brief Egyptian Hieroglyphs */
  588. Elba = UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  589. Ethi = UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  590. Geok = UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  591. Geor = UCD_SCRIPT_Geor, /**< @brief Georgian Script */
  592. Glag = UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  593. Goth = UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  594. Gran = UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  595. Grek = UCD_SCRIPT_Grek, /**< @brief Greek Script */
  596. Gujr = UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  597. Guru = UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  598. Hang = UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  599. Hani = UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  600. Hano = UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  601. Hans = UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  602. Hant = UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  603. Hatr = UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  604. Hebr = UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  605. Hira = UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  606. Hluw = UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  607. Hmng = UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  608. Hrkt = UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  609. Hung = UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  610. Inds = UCD_SCRIPT_Inds, /**< @brief Indus Script */
  611. Ital = UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  612. Java = UCD_SCRIPT_Java, /**< @brief Javanese Script */
  613. Jpan = UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  614. Jurc = UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  615. Kali = UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  616. Kana = UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  617. Khar = UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  618. Khmr = UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  619. Khoj = UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  620. Knda = UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  621. Kore = UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  622. Kpel = UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  623. Kthi = UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  624. Lana = UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  625. Laoo = UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  626. Latf = UCD_SCRIPT_Latf, /**< @brief Latin Script (Fraktur Variant) */
  627. Latg = UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  628. Latn = UCD_SCRIPT_Latn, /**< @brief Latin Script */
  629. Lepc = UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  630. Limb = UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  631. Lina = UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  632. Linb = UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  633. Lisu = UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  634. Loma = UCD_SCRIPT_Loma, /**< @brief Loma Script */
  635. Lyci = UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  636. Lydi = UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  637. Mahj = UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  638. Mand = UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  639. Mani = UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  640. Marc = UCD_SCRIPT_Marc, /**< @brief Marchen Script */
  641. Maya = UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  642. Mend = UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  643. Merc = UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  644. Mero = UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  645. Mlym = UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  646. Modi = UCD_SCRIPT_Modi, /**< @brief Modi Script */
  647. Mong = UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  648. Moon = UCD_SCRIPT_Moon, /**< @brief Moon Script */
  649. Mroo = UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  650. Mtei = UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  651. Mult = UCD_SCRIPT_Mult, /**< @brief Multani Script */
  652. Mymr = UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  653. Narb = UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  654. Nbat = UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  655. Newa = UCD_SCRIPT_Newa, /**< @brief Newa Script */
  656. Nkgb = UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  657. Nkoo = UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  658. Nshu = UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  659. Ogam = UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  660. Olck = UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  661. Orkh = UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  662. Orya = UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  663. Osge = UCD_SCRIPT_Osge, /**< @brief Osage Script */
  664. Osma = UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  665. Palm = UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  666. Pauc = UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  667. Perm = UCD_SCRIPT_Perm, /**< @brief Old Permic */
  668. Phag = UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  669. Phli = UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  670. Phlp = UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  671. Phlv = UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  672. Phnx = UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  673. Plrd = UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  674. Prti = UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  675. Qaak = UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  676. Rjng = UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  677. Roro = UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  678. Runr = UCD_SCRIPT_Runr, /**< @brief Runic Script */
  679. Samr = UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  680. Sara = UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  681. Sarb = UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  682. Saur = UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  683. Sgnw = UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  684. Shaw = UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  685. Shrd = UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  686. Sidd = UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  687. Sind = UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  688. Sinh = UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  689. Sora = UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  690. Sund = UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  691. Sylo = UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  692. Syrc = UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  693. Syre = UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  694. Syrj = UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  695. Syrn = UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  696. Tagb = UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  697. Takr = UCD_SCRIPT_Takr, /**< @brief Takri Script */
  698. Tale = UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  699. Talu = UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  700. Taml = UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  701. Tang = UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  702. Tavt = UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  703. Telu = UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  704. Teng = UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  705. Tfng = UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  706. Tglg = UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  707. Thaa = UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  708. Thai = UCD_SCRIPT_Thai, /**< @brief Thai Script */
  709. Tibt = UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  710. Tirh = UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  711. Ugar = UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  712. Vaii = UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  713. Visp = UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  714. Wara = UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  715. Wole = UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  716. Xpeo = UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  717. Xsux = UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  718. Yiii = UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  719. Zinh = UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  720. Zmth = UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  721. Zsym = UCD_SCRIPT_Zsym, /**< @brief Symbols */
  722. Zxxx = UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  723. Zyyy = UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  724. Zzzz = UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  725. };
  726. /** @brief Get a string representation of the script enumeration value.
  727. *
  728. * @param s The script value to get the string representation for; it is cast to ucd_script for the C API.
  729. *
  730. * @return The string representation obtained from ucd_get_script_string(), or "----" if the value is not recognized.
  731. */
  732. inline const char *get_script_string(script s)
  733. {
  734. return ucd_get_script_string((ucd_script)s);
  735. }
  736. /** @brief Look up the Script for a Unicode codepoint.
  737. *
  738. * @param c The Unicode codepoint to look up.
  739. * @return The Script of the Unicode codepoint: the result of ucd_lookup_script() cast to the script enumeration.
  740. */
  741. inline script lookup_script(codepoint_t c)
  742. {
  743. return (script)ucd_lookup_script(c);
  744. }
  745. /** @brief Codepoint properties, mirroring the UCD_PROPERTY_* constants of the C API.
  746. */
  747. enum property
  748. {
  749. White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space PropList */
  750. noBreak = UCD_PROPERTY_NO_BREAK, /**< @brief <noBreak> DispositionType (enabled check only) */
  751. Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control PropList */
  752. Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control PropList */
  753. Dash = UCD_PROPERTY_DASH, /**< @brief Dash PropList */
  754. Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen PropList */
  755. Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark PropList */
  756. Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation PropList */
  757. Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math PropList */
  758. Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit PropList */
  759. ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit PropList */
  760. Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic PropList */
  761. Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic PropList */
  762. Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic PropList */
  763. Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender PropList */
  764. Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase PropList */
  765. };
  766. /** @brief Return the properties of the specified codepoint.
  767. *
  768. * @param c The Unicode codepoint to look up.
  769. * @param cat The General Category of the codepoint; it is cast to ucd_category for the C API.
  770. * @return The properties associated with the codepoint: the result of ucd_properties() cast to the property enumeration. NOTE(review): if ucd_properties() can report several properties at once, the value may not equal any single enumerator -- confirm against the C API.
  771. */
  772. inline property properties(codepoint_t c, category cat)
  773. {
  774. return (property)ucd_properties(c, (ucd_category)cat);
  775. }
  776. /** @brief Is the codepoint in the 'alnum' class?
  777. *
  778. * @param c The Unicode codepoint to check.
  779. * @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise; the value is passed through unchanged from ucd_isalnum().
  780. */
  781. inline int isalnum(codepoint_t c)
  782. {
  783. return ucd_isalnum(c);
  784. }
  785. /** @brief Is the codepoint in the 'alpha' class?
  786. *
  787. * @param c The Unicode codepoint to check.
  788. * @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise; the value is passed through unchanged from ucd_isalpha().
  789. */
  790. inline int isalpha(codepoint_t c)
  791. {
  792. return ucd_isalpha(c);
  793. }
  794. /** @brief Is the codepoint in the 'blank' class?
  795. *
  796. * @param c The Unicode codepoint to check.
  797. * @return Non-zero if the codepoint is in the 'blank' class, zero otherwise; the value is passed through unchanged from ucd_isblank().
  798. */
  799. inline int isblank(codepoint_t c)
  800. {
  801. return ucd_isblank(c);
  802. }
  803. /** @brief Is the codepoint in the 'cntrl' class?
  804. *
  805. * @param c The Unicode codepoint to check.
  806. * @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise; the value is passed through unchanged from ucd_iscntrl().
  807. */
  808. inline int iscntrl(codepoint_t c)
  809. {
  810. return ucd_iscntrl(c);
  811. }
  812. /** @brief Is the codepoint in the 'digit' class?
  813. *
  814. * @param c The Unicode codepoint to check.
  815. * @return Non-zero if the codepoint is in the 'digit' class, zero otherwise; the value is passed through unchanged from ucd_isdigit().
  816. */
  817. inline int isdigit(codepoint_t c)
  818. {
  819. return ucd_isdigit(c);
  820. }
  821. /** @brief Is the codepoint in the 'graph' class?
  822. *
  823. * @param c The Unicode codepoint to check.
  824. * @return Non-zero if the codepoint is in the 'graph' class, zero otherwise; the value is passed through unchanged from ucd_isgraph().
  825. */
  826. inline int isgraph(codepoint_t c)
  827. {
  828. return ucd_isgraph(c);
  829. }
  830. /** @brief Is the codepoint in the 'lower' class?
  831. *
  832. * @param c The Unicode codepoint to check.
  833. * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise; the value is passed through unchanged from ucd_islower().
  834. */
  835. inline int islower(codepoint_t c)
  836. {
  837. return ucd_islower(c);
  838. }
  839. /** @brief Is the codepoint in the 'print' class?
  840. *
  841. * @param c The Unicode codepoint to check.
  842. * @return Non-zero if the codepoint is in the 'print' class, zero otherwise; the value is passed through unchanged from ucd_isprint().
  843. */
  844. inline int isprint(codepoint_t c)
  845. {
  846. return ucd_isprint(c);
  847. }
  848. /** @brief Is the codepoint in the 'punct' class?
  849. *
  850. * @param c The Unicode codepoint to check.
  851. * @return Non-zero if the codepoint is in the 'punct' class, zero otherwise; the value is passed through unchanged from ucd_ispunct().
  852. */
  853. inline int ispunct(codepoint_t c)
  854. {
  855. return ucd_ispunct(c);
  856. }
  857. /** @brief Is the codepoint in the 'space' class?
  858. *
  859. * @param c The Unicode codepoint to check.
  860. * @return Non-zero if the codepoint is in the 'space' class, zero otherwise; the value is passed through unchanged from ucd_isspace().
  861. */
  862. inline int isspace(codepoint_t c)
  863. {
  864. return ucd_isspace(c);
  865. }
  866. /** @brief Is the codepoint in the 'upper' class?
  867. *
  868. * @param c The Unicode codepoint to check.
  869. * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise; the value is passed through unchanged from ucd_isupper().
  870. */
  871. inline int isupper(codepoint_t c)
  872. {
  873. return ucd_isupper(c);
  874. }
  875. /** @brief Is the codepoint in the 'xdigit' class?
  876. *
  877. * @param c The Unicode codepoint to check.
  878. * @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise; the value is passed through unchanged from ucd_isxdigit().
  879. */
  880. inline int isxdigit(codepoint_t c)
  881. {
  882. return ucd_isxdigit(c);
  883. }
  884. /** @brief Convert the Unicode codepoint to upper-case.
  885. *
  886. * This function only uses the simple case mapping present in the
  887. * UnicodeData file. The SpecialCasing data is not used, as it requires
  888. * a single codepoint to be mapped to multiple codepoints.
  889. *
  890. * @param c The Unicode codepoint to convert.
  891. * @return The upper-case Unicode codepoint for this codepoint, or
  892. * this codepoint if there is no upper-case codepoint (the result of ucd_toupper()).
  893. */
  894. inline codepoint_t toupper(codepoint_t c)
  895. {
  896. return ucd_toupper(c);
  897. }
  898. /** @brief Convert the Unicode codepoint to lower-case.
  899. *
  900. * This function only uses the simple case mapping present in the
  901. * UnicodeData file. The SpecialCasing data is not used, as it requires
  902. * a single codepoint to be mapped to multiple codepoints.
  903. *
  904. * @param c The Unicode codepoint to convert.
  905. * @return The lower-case Unicode codepoint for this codepoint, or
  906. * this codepoint if there is no lower-case codepoint (the result of ucd_tolower()).
  907. */
  908. inline codepoint_t tolower(codepoint_t c)
  909. {
  910. return ucd_tolower(c);
  911. }
  912. /** @brief Convert the Unicode codepoint to title-case.
  913. *
  914. * This function only uses the simple case mapping present in the
  915. * UnicodeData file. The SpecialCasing data is not used, as it requires
  916. * a single codepoint to be mapped to multiple codepoints.
  917. *
  918. * @param c The Unicode codepoint to convert.
  919. * @return The title-case Unicode codepoint for this codepoint, or
  920. * this codepoint if there is no title-case codepoint (the result of ucd_totitle()).
  921. */
  922. inline codepoint_t totitle(codepoint_t c)
  923. {
  924. return ucd_totitle(c);
  925. }
  926. }
  927. #endif
  928. #endif