eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ucd.h 35KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. /* Unicode Character Database API
  2. *
  3. * Copyright (C) 2012-2016 Reece H. Dunn
  4. *
  5. * This file is part of ucd-tools.
  6. *
  7. * ucd-tools is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * ucd-tools is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifndef UNICODE_CHARACTER_DATA_H
  21. #define UNICODE_CHARACTER_DATA_H
  22. #include <stdint.h>
  23. #ifdef __cplusplus
  24. extern "C"
  25. {
  26. #endif
  27. /** @brief Represents a Unicode codepoint.
  28. */
  29. typedef uint32_t codepoint_t;
  30. /** @brief Unicode General Category Groups
  31. * @see http://www.unicode.org/reports/tr44/
  32. */
  33. typedef enum ucd_category_group_
  34. {
  35. UCD_CATEGORY_GROUP_C, /**< @brief Other */
  36. UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  37. UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  38. UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  39. UCD_CATEGORY_GROUP_N, /**< @brief Number */
  40. UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  41. UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  42. UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  43. } ucd_category_group;
  44. /** @brief Get a string representation of the category_group enumeration value.
  45. *
  46. * @param c The value to get the string representation for.
  47. *
  48. * @return The string representation, or "-" if the value is not recognized.
  49. */
  50. const char *ucd_get_category_group_string(ucd_category_group c);
  51. /** @brief Unicode General Category Values
  52. * @see http://www.unicode.org/reports/tr44/
  53. */
  54. typedef enum ucd_category_
  55. {
  56. UCD_CATEGORY_Cc, /**< @brief Control Character */
  57. UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  58. UCD_CATEGORY_Cn, /**< @brief Unassigned */
  59. UCD_CATEGORY_Co, /**< @brief Private Use */
  60. UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  61. UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  62. UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  63. UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
  64. UCD_CATEGORY_Lo, /**< @brief Other Letter */
  65. UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  66. UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  67. UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  68. UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  69. UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  70. UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  71. UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  72. UCD_CATEGORY_No, /**< @brief Other Number */
  73. UCD_CATEGORY_Pc, /**< @brief Connector */
  74. UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  75. UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  76. UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  77. UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  78. UCD_CATEGORY_Po, /**< @brief Other Punctuation Mark */
  79. UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  80. UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  81. UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  82. UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  83. UCD_CATEGORY_So, /**< @brief Other Symbol */
  84. UCD_CATEGORY_Zl, /**< @brief Line Separator */
  85. UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  86. UCD_CATEGORY_Zs, /**< @brief Space Separator */
  87. } ucd_category;
  88. /** @brief Get a string representation of the category enumeration value.
  89. *
  90. * @param c The value to get the string representation for.
  91. *
  92. * @return The string representation, or "--" if the value is not recognized.
  93. */
  94. const char *ucd_get_category_string(ucd_category c);
  95. /** @brief Lookup the General Category Group for a General Category.
  96. *
  97. * @param c The General Category to lookup.
  98. * @return The General Category Group of the General Category.
  99. */
  100. ucd_category_group ucd_get_category_group_for_category(ucd_category c);
  101. /** @brief Lookup the General Category Group for a Unicode codepoint.
  102. *
  103. * @param c The Unicode codepoint to lookup.
  104. * @return The General Category Group of the Unicode codepoint.
  105. */
  106. ucd_category_group ucd_lookup_category_group(codepoint_t c);
  107. /** @brief Lookup the General Category for a Unicode codepoint.
  108. *
  109. * @param c The Unicode codepoint to lookup.
  110. * @return The General Category of the Unicode codepoint.
  111. */
  112. ucd_category ucd_lookup_category(codepoint_t c);
  113. /** @brief Unicode Script
  114. * @see http://www.iana.org/assignments/language-subtag-registry
  115. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  116. */
  117. typedef enum ucd_script_
  118. {
  119. UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  120. UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  121. UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  122. UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  123. UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  124. UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  125. UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  126. UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  127. UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  128. UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  129. UCD_SCRIPT_Batk, /**< @brief Batak Script */
  130. UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  131. UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  132. UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  133. UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  134. UCD_SCRIPT_Brai, /**< @brief Braille Script */
  135. UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  136. UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  137. UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  138. UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  139. UCD_SCRIPT_Cari, /**< @brief Carian Script */
  140. UCD_SCRIPT_Cham, /**< @brief Cham Script */
  141. UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  142. UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  143. UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  144. UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  145. UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  146. UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  147. UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  148. UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  149. UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  150. UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  151. UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  152. UCD_SCRIPT_Egyp, /**< @brief Egyptian Hieroglyphs */
  153. UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  154. UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  155. UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  156. UCD_SCRIPT_Geor, /**< @brief Georgian Script */
  157. UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  158. UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  159. UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  160. UCD_SCRIPT_Grek, /**< @brief Greek Script */
  161. UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  162. UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  163. UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  164. UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  165. UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  166. UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  167. UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  168. UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  169. UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  170. UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  171. UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  172. UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  173. UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  174. UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  175. UCD_SCRIPT_Inds, /**< @brief Indus Script */
  176. UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  177. UCD_SCRIPT_Java, /**< @brief Javanese Script */
  178. UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  179. UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  180. UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  181. UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  182. UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  183. UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  184. UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  185. UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  186. UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  187. UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  188. UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  189. UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  190. UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  191. UCD_SCRIPT_Latf, /**< @brief Latin Script (Fraktur Variant) */
  192. UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  193. UCD_SCRIPT_Latn, /**< @brief Latin Script */
  194. UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  195. UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  196. UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  197. UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  198. UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  199. UCD_SCRIPT_Loma, /**< @brief Loma Script */
  200. UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  201. UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  202. UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  203. UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  204. UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  205. UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  206. UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  207. UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  208. UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  209. UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  210. UCD_SCRIPT_Modi, /**< @brief Modi Script */
  211. UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  212. UCD_SCRIPT_Moon, /**< @brief Moon Script */
  213. UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  214. UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  215. UCD_SCRIPT_Mult, /**< @brief Multani Script */
  216. UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  217. UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  218. UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  219. UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  220. UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  221. UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  222. UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  223. UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  224. UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  225. UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  226. UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  227. UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  228. UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  229. UCD_SCRIPT_Perm, /**< @brief Old Permic */
  230. UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  231. UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  232. UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  233. UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  234. UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  235. UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  236. UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  237. UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  238. UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  239. UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  240. UCD_SCRIPT_Runr, /**< @brief Runic Script */
  241. UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  242. UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  243. UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  244. UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  245. UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  246. UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  247. UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  248. UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  249. UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  250. UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  251. UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  252. UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  253. UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  254. UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  255. UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  256. UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  257. UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  258. UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  259. UCD_SCRIPT_Takr, /**< @brief Takri Script */
  260. UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  261. UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  262. UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  263. UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  264. UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  265. UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  266. UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  267. UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  268. UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  269. UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  270. UCD_SCRIPT_Thai, /**< @brief Thai Script */
  271. UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  272. UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  273. UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  274. UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  275. UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  276. UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  277. UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  278. UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  279. UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  280. UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  281. UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  282. UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  283. UCD_SCRIPT_Zsym, /**< @brief Symbols */
  284. UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  285. UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  286. UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  287. } ucd_script;
  288. /** @brief Get a string representation of the script enumeration value.
  289. *
  290. * @param s The value to get the string representation for.
  291. *
  292. * @return The string representation, or "----" if the value is not recognized.
  293. */
  294. const char *ucd_get_script_string(ucd_script s);
  295. /** @brief Lookup the Script for a Unicode codepoint.
  296. *
  297. * @param c The Unicode codepoint to lookup.
  298. * @return The Script of the Unicode codepoint.
  299. */
  300. ucd_script ucd_lookup_script(codepoint_t c);
  301. /** @brief Is the codepoint an alpha-numeric character?
  302. *
  303. * @param c The Unicode codepoint to check.
  304. * @return Non-zero if the codepoint is a letter or number, zero otherwise.
  305. */
  306. int ucd_isalnum(codepoint_t c);
  307. /** @brief Is the codepoint a letter?
  308. *
  309. * @param c The Unicode codepoint to check.
  310. * @return Non-zero if the codepoint is a letter, zero otherwise.
  311. */
  312. int ucd_isalpha(codepoint_t c);
  313. /** @brief Is the codepoint a control character?
  314. *
  315. * @param c The Unicode codepoint to check.
  316. * @return Non-zero if the codepoint is a control character, zero otherwise.
  317. */
  318. int ucd_iscntrl(codepoint_t c);
  319. /** @brief Is the codepoint a numeric character?
  320. *
  321. * @param c The Unicode codepoint to check.
  322. * @return Non-zero if the codepoint is a number, zero otherwise.
  323. */
  324. int ucd_isdigit(codepoint_t c);
  325. /** @brief Does the codepoint have a displayable glyph?
  326. *
  327. * @param c The Unicode codepoint to check.
  328. * @return Non-zero if the codepoint has a displayable glyph, zero otherwise.
  329. */
  330. int ucd_isgraph(codepoint_t c);
  331. /** @brief Is the codepoint a lower-case letter?
  332. *
  333. * @param c The Unicode codepoint to check.
  334. * @return Non-zero if the codepoint is a lower-case letter, zero otherwise.
  335. */
  336. int ucd_islower(codepoint_t c);
  337. /** @brief Is the codepoint a printable character?
  338. *
  339. * @param c The Unicode codepoint to check.
  340. * @return Non-zero if the codepoint is a printable character, zero otherwise.
  341. */
  342. int ucd_isprint(codepoint_t c);
  343. /** @brief Is the codepoint a punctuation character?
  344. *
  345. * @param c The Unicode codepoint to check.
  346. * @return Non-zero if the codepoint is a punctuation character, zero otherwise.
  347. */
  348. int ucd_ispunct(codepoint_t c);
  349. /** @brief Is the codepoint a whitespace character?
  350. *
  351. * @param c The Unicode codepoint to check.
  352. * @return Non-zero if the codepoint is a whitespace character, zero otherwise.
  353. */
  354. int ucd_isspace(codepoint_t c);
  355. /** @brief Is the codepoint an upper-case letter?
  356. *
  357. * @param c The Unicode codepoint to check.
  358. * @return Non-zero if the codepoint is an upper-case letter, zero otherwise.
  359. */
  360. int ucd_isupper(codepoint_t c);
  361. /** @brief Convert the Unicode codepoint to upper-case.
  362. *
  363. * This function only uses the simple case mapping present in the
  364. * UnicodeData file. The data in SpecialCasing requires Unicode
  365. * codepoints to be mapped to multiple codepoints.
  366. *
  367. * @param c The Unicode codepoint to convert.
  368. * @return The upper-case Unicode codepoint for this codepoint, or
  369. * this codepoint if there is no upper-case codepoint.
  370. */
  371. codepoint_t ucd_toupper(codepoint_t c);
  372. /** @brief Convert the Unicode codepoint to lower-case.
  373. *
  374. * This function only uses the simple case mapping present in the
  375. * UnicodeData file. The data in SpecialCasing requires Unicode
  376. * codepoints to be mapped to multiple codepoints.
  377. *
  378. * @param c The Unicode codepoint to convert.
  379. * @return The lower-case Unicode codepoint for this codepoint, or
  380. * this codepoint if there is no lower-case codepoint.
  381. */
  382. codepoint_t ucd_tolower(codepoint_t c);
  383. /** @brief Convert the Unicode codepoint to title-case.
  384. *
  385. * This function only uses the simple case mapping present in the
  386. * UnicodeData file. The data in SpecialCasing requires Unicode
  387. * codepoints to be mapped to multiple codepoints.
  388. *
  389. * @param c The Unicode codepoint to convert.
  390. * @return The title-case Unicode codepoint for this codepoint, or
  391. * this codepoint if there is no title-case codepoint.
  392. */
  393. codepoint_t ucd_totitle(codepoint_t c);
  394. #ifdef __cplusplus
  395. }
  396. /** @brief Unicode Character Database
  397. */
  398. namespace ucd
  399. {
  400. /** @brief Represents a Unicode codepoint.
  401. */
  402. using ::codepoint_t;
  403. /** @brief Unicode General Category Groups
  404. * @see http://www.unicode.org/reports/tr44/
  405. */
  406. enum category_group
  407. {
  408. C = UCD_CATEGORY_GROUP_C, /**< @brief Other */
  409. I = UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  410. L = UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  411. M = UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  412. N = UCD_CATEGORY_GROUP_N, /**< @brief Number */
  413. P = UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  414. S = UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  415. Z = UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  416. };
  417. /** @brief Get a string representation of the category_group enumeration value.
  418. *
  419. * @param c The value to get the string representation for.
  420. *
  421. * @return The string returned by ucd_get_category_group_string() for @p c cast to ucd_category_group, or "-" if the value is not recognized.
  422. */
  423. inline const char *get_category_group_string(category_group c)
  424. {
  425. return ucd_get_category_group_string((ucd_category_group)c);
  426. }
  427. /** @brief Unicode General Category Values
  428. * @see http://www.unicode.org/reports/tr44/
  429. */
  430. enum category
  431. {
  432. Cc = UCD_CATEGORY_Cc, /**< @brief Control Character */
  433. Cf = UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  434. Cn = UCD_CATEGORY_Cn, /**< @brief Unassigned */
  435. Co = UCD_CATEGORY_Co, /**< @brief Private Use */
  436. Cs = UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  437. Ii = UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  438. Ll = UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  439. Lm = UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
  440. Lo = UCD_CATEGORY_Lo, /**< @brief Other Letter */
  441. Lt = UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  442. Lu = UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  443. Mc = UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  444. Me = UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  445. Mn = UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  446. Nd = UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  447. Nl = UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  448. No = UCD_CATEGORY_No, /**< @brief Other Number */
  449. Pc = UCD_CATEGORY_Pc, /**< @brief Connector */
  450. Pd = UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  451. Pe = UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  452. Pf = UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  453. Pi = UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  454. Po = UCD_CATEGORY_Po, /**< @brief Other Punctuation Mark */
  455. Ps = UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  456. Sc = UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  457. Sk = UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  458. Sm = UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  459. So = UCD_CATEGORY_So, /**< @brief Other Symbol */
  460. Zl = UCD_CATEGORY_Zl, /**< @brief Line Separator */
  461. Zp = UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  462. Zs = UCD_CATEGORY_Zs, /**< @brief Space Separator */
  463. };
  464. /** @brief Get a string representation of the category enumeration value.
  465. *
  466. * @param c The value to get the string representation for.
  467. *
  468. * @return The string returned by ucd_get_category_string() for @p c cast to ucd_category, or "--" if the value is not recognized.
  469. */
  470. inline const char *get_category_string(category c)
  471. {
  472. return ucd_get_category_string((ucd_category)c);
  473. }
  474. /** @brief Lookup the General Category Group for a General Category.
  475. *
  476. * @param c The General Category to lookup.
  477. * @return The General Category Group of the General Category.
  478. */
  479. category_group lookup_category_group(category c);
  480. /** @brief Lookup the General Category Group for a Unicode codepoint.
  481. *
  482. * @param c The Unicode codepoint to lookup.
  483. * @return The General Category Group of the Unicode codepoint.
  484. */
  485. category_group lookup_category_group(codepoint_t c);
  486. /** @brief Lookup the General Category for a Unicode codepoint.
  487. *
  488. * @param c The Unicode codepoint to lookup.
  489. * @return The General Category of the Unicode codepoint.
  490. */
  491. category lookup_category(codepoint_t c);
  492. /** @brief Unicode Script
  493. * @see http://www.iana.org/assignments/language-subtag-registry
  494. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  495. */
  496. enum script
  497. {
  498. Afak = UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  499. Aghb = UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  500. Ahom = UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  501. Arab = UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  502. Armi = UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  503. Armn = UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  504. Avst = UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  505. Bali = UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  506. Bamu = UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  507. Bass = UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  508. Batk = UCD_SCRIPT_Batk, /**< @brief Batak Script */
  509. Beng = UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  510. Blis = UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  511. Bopo = UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  512. Brah = UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  513. Brai = UCD_SCRIPT_Brai, /**< @brief Braille Script */
  514. Bugi = UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  515. Buhd = UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  516. Cakm = UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  517. Cans = UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  518. Cari = UCD_SCRIPT_Cari, /**< @brief Carian Script */
  519. Cham = UCD_SCRIPT_Cham, /**< @brief Cham Script */
  520. Cher = UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  521. Cirt = UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  522. Copt = UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  523. Cprt = UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  524. Cyrl = UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  525. Cyrs = UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  526. Deva = UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  527. Dsrt = UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  528. Dupl = UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  529. Egyd = UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  530. Egyh = UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  531. Egyp = UCD_SCRIPT_Egyp, /**< @brief Egyptian Hieroglyphs */
  532. Elba = UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  533. Ethi = UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  534. Geok = UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  535. Geor = UCD_SCRIPT_Geor, /**< @brief Georgian Script */
  536. Glag = UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  537. Goth = UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  538. Gran = UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  539. Grek = UCD_SCRIPT_Grek, /**< @brief Greek Script */
  540. Gujr = UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  541. Guru = UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  542. Hang = UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  543. Hani = UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  544. Hano = UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  545. Hans = UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  546. Hant = UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  547. Hatr = UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  548. Hebr = UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  549. Hira = UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  550. Hluw = UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  551. Hmng = UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  552. Hrkt = UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  553. Hung = UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  554. Inds = UCD_SCRIPT_Inds, /**< @brief Indus Script */
  555. Ital = UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  556. Java = UCD_SCRIPT_Java, /**< @brief Javanese Script */
  557. Jpan = UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  558. Jurc = UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  559. Kali = UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  560. Kana = UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  561. Khar = UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  562. Khmr = UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  563. Khoj = UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  564. Knda = UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  565. Kore = UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  566. Kpel = UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  567. Kthi = UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  568. Lana = UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  569. Laoo = UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  570. Latf = UCD_SCRIPT_Latf, /**< @brief Latin Script (Fraktur Variant) */
  571. Latg = UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  572. Latn = UCD_SCRIPT_Latn, /**< @brief Latin Script */
  573. Lepc = UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  574. Limb = UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  575. Lina = UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  576. Linb = UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  577. Lisu = UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  578. Loma = UCD_SCRIPT_Loma, /**< @brief Loma Script */
  579. Lyci = UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  580. Lydi = UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  581. Mahj = UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  582. Mand = UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  583. Mani = UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  584. Maya = UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  585. Mend = UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  586. Merc = UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  587. Mero = UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  588. Mlym = UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  589. Modi = UCD_SCRIPT_Modi, /**< @brief Modi Script */
  590. Mong = UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  591. Moon = UCD_SCRIPT_Moon, /**< @brief Moon Script */
  592. Mroo = UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  593. Mtei = UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  594. Mult = UCD_SCRIPT_Mult, /**< @brief Multani Script */
  595. Mymr = UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  596. Narb = UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  597. Nbat = UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  598. Nkgb = UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  599. Nkoo = UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  600. Nshu = UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  601. Ogam = UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  602. Olck = UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  603. Orkh = UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  604. Orya = UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  605. Osma = UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  606. Palm = UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  607. Pauc = UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  608. Perm = UCD_SCRIPT_Perm, /**< @brief Old Permic */
  609. Phag = UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  610. Phli = UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  611. Phlp = UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  612. Phlv = UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  613. Phnx = UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  614. Plrd = UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  615. Prti = UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  616. Qaak = UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  617. Rjng = UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  618. Roro = UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  619. Runr = UCD_SCRIPT_Runr, /**< @brief Runic Script */
  620. Samr = UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  621. Sara = UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  622. Sarb = UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  623. Saur = UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  624. Sgnw = UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  625. Shaw = UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  626. Shrd = UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  627. Sidd = UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  628. Sind = UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  629. Sinh = UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  630. Sora = UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  631. Sund = UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  632. Sylo = UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  633. Syrc = UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  634. Syre = UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  635. Syrj = UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  636. Syrn = UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  637. Tagb = UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  638. Takr = UCD_SCRIPT_Takr, /**< @brief Takri Script */
  639. Tale = UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  640. Talu = UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  641. Taml = UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  642. Tang = UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  643. Tavt = UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  644. Telu = UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  645. Teng = UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  646. Tfng = UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  647. Tglg = UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  648. Thaa = UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  649. Thai = UCD_SCRIPT_Thai, /**< @brief Thai Script */
  650. Tibt = UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  651. Tirh = UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  652. Ugar = UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  653. Vaii = UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  654. Visp = UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  655. Wara = UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  656. Wole = UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  657. Xpeo = UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  658. Xsux = UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  659. Yiii = UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  660. Zinh = UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  661. Zmth = UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  662. Zsym = UCD_SCRIPT_Zsym, /**< @brief Symbols */
  663. Zxxx = UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  664. Zyyy = UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  665. Zzzz = UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  666. };
  667. /** @brief Return the textual name of a script enumeration value.
  668. *
  669. * @param s The script value whose name is requested.
  670. *
  671. * @return The string representation, or "----" if the value is not recognized.
  672. */
  673. inline const char *get_script_string(script s)
  674. {
  675. return ucd_get_script_string(static_cast<ucd_script>(s));
  676. }
  677. /** @brief Look up the script of a Unicode codepoint.
  678. *
  679. * @param c The Unicode codepoint to look up.
  680. * @return The script enumeration value for the Unicode codepoint.
  681. */
  682. script lookup_script(codepoint_t c);
  683. /** @brief Check whether a codepoint is alphanumeric (a letter or a number).
  684. *
  685. * @param c The Unicode codepoint to check.
  686. * @return Non-zero if the codepoint is a letter or number, zero otherwise.
  687. */
  688. int isalnum(codepoint_t c);
  689. /** @brief Check whether a codepoint is a letter.
  690. *
  691. * @param c The Unicode codepoint to check.
  692. * @return Non-zero if the codepoint is a letter, zero otherwise.
  693. */
  694. int isalpha(codepoint_t c);
  695. /** @brief Check whether a codepoint is a control character.
  696. *
  697. * @param c The Unicode codepoint to check.
  698. * @return Non-zero if the codepoint is a control character, zero otherwise.
  699. */
  700. int iscntrl(codepoint_t c);
  701. /** @brief Check whether a codepoint is a numeric character (a number).
  702. *
  703. * @param c The Unicode codepoint to check.
  704. * @return Non-zero if the codepoint is a number, zero otherwise.
  705. */
  706. int isdigit(codepoint_t c);
  707. /** @brief Check whether a codepoint has a displayable glyph.
  708. *
  709. * @param c The Unicode codepoint to check.
  710. * @return Non-zero if the codepoint has a displayable glyph, zero otherwise.
  711. */
  712. int isgraph(codepoint_t c);
  713. /** @brief Check whether a codepoint is a lower-case letter.
  714. *
  715. * @param c The Unicode codepoint to check.
  716. * @return Non-zero if the codepoint is a lower-case letter, zero otherwise.
  717. */
  718. int islower(codepoint_t c);
  719. /** @brief Check whether a codepoint is a printable character.
  720. *
  721. * @param c The Unicode codepoint to check.
  722. * @return Non-zero if the codepoint is a printable character, zero otherwise.
  723. */
  724. int isprint(codepoint_t c);
  725. /** @brief Check whether a codepoint is a punctuation character.
  726. *
  727. * @param c The Unicode codepoint to check.
  728. * @return Non-zero if the codepoint is a punctuation character, zero otherwise.
  729. */
  730. int ispunct(codepoint_t c);
  731. /** @brief Check whether a codepoint is a whitespace character.
  732. *
  733. * @param c The Unicode codepoint to check.
  734. * @return Non-zero if the codepoint is a whitespace character, zero otherwise.
  735. */
  736. int isspace(codepoint_t c);
  737. /** @brief Check whether a codepoint is an upper-case letter.
  738. *
  739. * @param c The Unicode codepoint to check.
  740. * @return Non-zero if the codepoint is an upper-case letter, zero otherwise.
  741. */
  742. int isupper(codepoint_t c);
  743. /** @brief Convert the Unicode codepoint to upper-case.
  744. *
  745. * Only the simple case mapping present in the UnicodeData file is
  746. * applied. The SpecialCasing data is not used, because it requires a
  747. * codepoint to be mapped to multiple codepoints.
  748. *
  749. * @param c The Unicode codepoint to convert.
  750. * @return The upper-case Unicode codepoint for this codepoint, or
  751. * this codepoint if there is no upper-case codepoint.
  752. */
  753. codepoint_t toupper(codepoint_t c);
  754. /** @brief Convert the Unicode codepoint to lower-case.
  755. *
  756. * Only the simple case mapping present in the UnicodeData file is
  757. * applied. The SpecialCasing data is not used, because it requires a
  758. * codepoint to be mapped to multiple codepoints.
  759. *
  760. * @param c The Unicode codepoint to convert.
  761. * @return The lower-case Unicode codepoint for this codepoint, or
  762. * this codepoint if there is no lower-case codepoint.
  763. */
  764. codepoint_t tolower(codepoint_t c);
  765. /** @brief Convert the Unicode codepoint to title-case.
  766. *
  767. * Only the simple case mapping present in the UnicodeData file is
  768. * applied. The SpecialCasing data is not used, because it requires a
  769. * codepoint to be mapped to multiple codepoints.
  770. *
  771. * @param c The Unicode codepoint to convert.
  772. * @return The title-case Unicode codepoint for this codepoint, or
  773. * this codepoint if there is no title-case codepoint.
  774. */
  775. codepoint_t totitle(codepoint_t c);
  776. }
  777. #endif
  778. #endif