123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- /* Unicode Character Database API
- *
- * Copyright (C) 2012 Reece H. Dunn
- *
- * This file is part of ucd-tools.
- *
- * ucd-tools is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ucd-tools is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
- */
-
- #ifndef UNICODE_CHARACTER_DATA_H
- #define UNICODE_CHARACTER_DATA_H
-
- #include <stdint.h>
-
- /** @brief Unicode Character Database
- */
- namespace ucd
- {
- /** @brief Represents a Unicode codepoint.
- *
- * This is an alias for the fixed-width unsigned 32-bit integer type
- * @c uint32_t from @c <stdint.h>.
- */
- typedef uint32_t codepoint_t;
-
- /** @name Unicode General Category
- * @brief These functions query the General Category property of Unicode codepoints.
- */
- //@{
-
- /** @brief Unicode General Category Groups
- *
- * This is an unscoped enumeration, so the enumerators are visible directly
- * in the @c ucd namespace (e.g. @c ucd::L). No explicit values are assigned:
- * the enumerators are numbered from zero in declaration order, so reordering
- * them changes their values.
- *
- * @see http://www.unicode.org/reports/tr44/
- */
- enum category_group
- {
- C, /**< @brief Other */
- I, /**< @brief Invalid */
- L, /**< @brief Letter */
- M, /**< @brief Mark */
- N, /**< @brief Number */
- P, /**< @brief Punctuation */
- S, /**< @brief Symbol */
- Z, /**< @brief Separator */
- };
-
- /** @brief Unicode General Category Values
- *
- * This is an unscoped enumeration, so the enumerators are visible directly
- * in the @c ucd namespace (e.g. @c ucd::Lu). No explicit values are assigned:
- * the enumerators are numbered from zero in declaration order, so reordering
- * them changes their values.
- *
- * @see http://www.unicode.org/reports/tr44/
- */
- enum category
- {
- Cc, /**< @brief Control Character */
- Cf, /**< @brief Format Control Character */
- Cn, /**< @brief Unassigned */
- Co, /**< @brief Private Use */
- Cs, /**< @brief Surrogate Code Point */
-
- Ii, /**< @brief Invalid Unicode Codepoint */
-
- Ll, /**< @brief Lower Case Letter */
- Lm, /**< @brief Modifier Letter */
- Lo, /**< @brief Other Letter */
- Lt, /**< @brief Title Case Letter */
- Lu, /**< @brief Upper Case Letter */
-
- Mc, /**< @brief Spacing Mark */
- Me, /**< @brief Enclosing Mark */
- Mn, /**< @brief Non-Spacing Mark */
-
- Nd, /**< @brief Decimal Digit */
- Nl, /**< @brief Letter-Like Number */
- No, /**< @brief Other Number */
-
- Pc, /**< @brief Connector Punctuation Mark */
- Pd, /**< @brief Dash/Hyphen */
- Pe, /**< @brief Close Punctuation Mark */
- Pf, /**< @brief Final Quotation Mark */
- Pi, /**< @brief Initial Quotation Mark */
- Po, /**< @brief Other Punctuation Mark */
- Ps, /**< @brief Open Punctuation Mark */
-
- Sc, /**< @brief Currency Symbol */
- Sk, /**< @brief Modifier Symbol */
- Sm, /**< @brief Math Symbol */
- So, /**< @brief Other Symbol */
-
- Zl, /**< @brief Line Separator */
- Zp, /**< @brief Paragraph Separator */
- Zs, /**< @brief Space Separator */
- };
-
- /** @brief Look up the General Category Group for a General Category.
- *
- * This overload takes a @c category value; a separate overload with the
- * same name takes a @c codepoint_t.
- *
- * @param c The General Category to look up.
- * @return The General Category Group of the General Category.
- */
- category_group lookup_category_group(category c);
-
- /** @brief Look up the General Category Group for a Unicode codepoint.
- *
- * This overload takes a @c codepoint_t; a separate overload with the
- * same name takes a @c category value.
- *
- * @param c The Unicode codepoint to look up.
- * @return The General Category Group of the Unicode codepoint.
- */
- category_group lookup_category_group(codepoint_t c);
-
- /** @brief Look up the General Category for a Unicode codepoint.
- *
- * @param c The Unicode codepoint to look up.
- * @return The General Category of the Unicode codepoint, as a
- * @c category enumerator.
- */
- category lookup_category(codepoint_t c);
-
-
- //@}
- /** @name Unicode Script
- * @brief These functions query the Script property of Unicode codepoints.
- */
- //@{
-
-
- /** @brief Unicode Script
- *
- * The enumerator names are four-letter script codes. This is an unscoped
- * enumeration, so the enumerators are visible directly in the @c ucd
- * namespace (e.g. @c ucd::Latn). No explicit values are assigned: the
- * enumerators are numbered from zero in declaration order. That order is
- * not strictly alphabetical (@c Cari precedes @c Cakm), and sorting the
- * list would change the enumerator values.
- *
- * @see http://www.iana.org/assignments/language-subtag-registry
- * @see http://www.unicode.org/iso15924/iso15924-codes.html
- */
- enum script
- {
- Arab, /**< @brief Arabic Script */
- Armi, /**< @brief Imperial Aramaic Script */
- Armn, /**< @brief Armenian Script */
- Avst, /**< @brief Avestan Script */
- Bali, /**< @brief Balinese Script */
- Bamu, /**< @brief Bamum Script */
- Batk, /**< @brief Batak Script */
- Beng, /**< @brief Bengali Script */
- Bopo, /**< @brief Bopomofo Script */
- Brah, /**< @brief Brahmi Script */
- Brai, /**< @brief Braille Script */
- Bugi, /**< @brief Buginese Script */
- Buhd, /**< @brief Buhid Script */
- Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
- Cari, /**< @brief Carian Script */
- Cakm, /**< @brief Chakma Script */
- Cham, /**< @brief Cham Script */
- Cher, /**< @brief Cherokee Script */
- Copt, /**< @brief Coptic Script */
- Cprt, /**< @brief Cypriot Script */
- Cyrl, /**< @brief Cyrillic Script */
- Deva, /**< @brief Devanagari Script */
- Dsrt, /**< @brief Deseret Script */
- Egyp, /**< @brief Egyptian Hieroglyphs */
- Ethi, /**< @brief Ethiopic Script */
- Geor, /**< @brief Georgian Script */
- Glag, /**< @brief Glagolitic Script */
- Goth, /**< @brief Gothic Script */
- Grek, /**< @brief Greek Script */
- Gujr, /**< @brief Gujarati Script */
- Guru, /**< @brief Gurmukhi Script */
- Hang, /**< @brief Hangul Script */
- Hano, /**< @brief Hanunoo Script */
- Hant, /**< @brief Han (Traditional) Script */
- Hebr, /**< @brief Hebrew Script */
- Hira, /**< @brief Hiragana Script */
- Ital, /**< @brief Old Italic Script */
- Java, /**< @brief Javanese Script */
- Kali, /**< @brief Kayah Li Script */
- Kana, /**< @brief Katakana Script */
- Khar, /**< @brief Kharoshthi Script */
- Khmr, /**< @brief Khmer Script */
- Knda, /**< @brief Kannada Script */
- Kthi, /**< @brief Kaithi Script */
- Lana, /**< @brief Tai Tham Script */
- Laoo, /**< @brief Lao Script */
- Latn, /**< @brief Latin Script */
- Lepc, /**< @brief Lepcha Script */
- Limb, /**< @brief Limbu Script */
- Linb, /**< @brief Linear B Script */
- Lisu, /**< @brief Lisu Script */
- Lyci, /**< @brief Lycian Script */
- Lydi, /**< @brief Lydian Script */
- Mand, /**< @brief Mandaic Script */
- Merc, /**< @brief Meroitic Cursive Script */
- Mero, /**< @brief Meroitic Hieroglyphs */
- Mlym, /**< @brief Malayalam Script */
- Mong, /**< @brief Mongolian Script */
- Mtei, /**< @brief Meitei Mayek Script */
- Mymr, /**< @brief Myanmar Script */
- Nkoo, /**< @brief N'Ko Script */
- Ogam, /**< @brief Ogham Script */
- Olck, /**< @brief Ol Chiki Script */
- Orkh, /**< @brief Old Turkic Script */
- Orya, /**< @brief Oriya Script */
- Osma, /**< @brief Osmanya Script */
- Phag, /**< @brief Phags-Pa Script */
- Phli, /**< @brief Inscriptional Pahlavi Script */
- Phnx, /**< @brief Phoenician Script */
- Plrd, /**< @brief Miao Script */
- Prti, /**< @brief Inscriptional Parthian Script */
- Rjng, /**< @brief Rejang Script */
- Runr, /**< @brief Runic Script */
- Samr, /**< @brief Samaritan Script */
- Sarb, /**< @brief Old South Arabian Script */
- Saur, /**< @brief Saurashtra Script */
- Shaw, /**< @brief Shavian Script */
- Shrd, /**< @brief Sharada Script */
- Sinh, /**< @brief Sinhala Script */
- Sora, /**< @brief Sora Sompeng Script */
- Sund, /**< @brief Sundanese Script */
- Sylo, /**< @brief Syloti Nagri Script */
- Syrn, /**< @brief Syriac (Eastern) Script */
- Tagb, /**< @brief Tagbanwa Script */
- Takr, /**< @brief Takri Script */
- Tale, /**< @brief Tai Le Script */
- Talu, /**< @brief New Tai Lue Script */
- Taml, /**< @brief Tamil Script */
- Tavt, /**< @brief Tai Viet Script */
- Telu, /**< @brief Telugu Script */
- Tfng, /**< @brief Tifinagh Script */
- Tglg, /**< @brief Tagalog Script */
- Thaa, /**< @brief Thaana Script */
- Thai, /**< @brief Thai Script */
- Tibt, /**< @brief Tibetan Script */
- Ugar, /**< @brief Ugaritic Script */
- Vaii, /**< @brief Vai Script */
- Xpeo, /**< @brief Old Persian Script */
- Xsux, /**< @brief Cuneiform Script */
- Yiii, /**< @brief Yi Script */
- Zyyy, /**< @brief Inherited Script (NOTE(review): ISO 15924 appears to list Zyyy as the undetermined script code and Zinh as inherited -- confirm the intended meaning) */
- Zzzz, /**< @brief Unknown Script */
- };
-
- /** @brief Look up the Script for a Unicode codepoint.
- *
- * @param c The Unicode codepoint to look up.
- * @return The Script of the Unicode codepoint, as a @c script enumerator.
- */
- script lookup_script(codepoint_t c);
-
-
- //@}
- /** @name ctype-style APIs
- * @brief These functions provide wctype compatible functions using the UCD data.
- */
- //@{
-
-
- /** @brief Is the codepoint an alphanumeric character?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a letter or number, zero otherwise.
- */
- int isalnum(codepoint_t c);
-
- /** @brief Is the codepoint a letter?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a letter, zero otherwise.
- */
- int isalpha(codepoint_t c);
-
- /** @brief Is the codepoint a control character?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a control character, zero otherwise.
- */
- int iscntrl(codepoint_t c);
-
- /** @brief Is the codepoint a numeric character?
- *
- * NOTE(review): this declaration does not say whether "numeric" means
- * only decimal digits or every Number category -- confirm against the
- * implementation before relying on either reading.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a number, zero otherwise.
- */
- int isdigit(codepoint_t c);
-
- /** @brief Does the codepoint have a displayable glyph?
- *
- * NOTE(review): how this differs from @c isprint (for example, in the
- * treatment of space characters) is not stated in this header -- confirm
- * against the implementation.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint has a displayable glyph, zero otherwise.
- */
- int isgraph(codepoint_t c);
-
- /** @brief Is the codepoint a lower-case letter?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a lower-case letter, zero otherwise.
- */
- int islower(codepoint_t c);
-
- /** @brief Is the codepoint a printable character?
- *
- * NOTE(review): how this differs from @c isgraph (for example, in the
- * treatment of space characters) is not stated in this header -- confirm
- * against the implementation.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a printable character, zero otherwise.
- */
- int isprint(codepoint_t c);
-
- /** @brief Is the codepoint a punctuation character?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a punctuation character, zero otherwise.
- */
- int ispunct(codepoint_t c);
-
- /** @brief Is the codepoint a whitespace character?
- *
- * NOTE(review): this declaration does not list which codepoints count as
- * whitespace (for example, whether control characters such as tab and
- * newline are included) -- confirm against the implementation.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is a whitespace character, zero otherwise.
- */
- int isspace(codepoint_t c);
-
- /** @brief Is the codepoint an upper-case letter?
- *
- * Declared in the @c ucd namespace and taking a @c codepoint_t, so it is
- * a distinct function from the C library function of the same name.
- *
- * @param c The Unicode codepoint to check.
- * @return Non-zero if the codepoint is an upper-case letter, zero otherwise.
- */
- int isupper(codepoint_t c);
-
-
- //@}
- /** @name Case Conversion APIs
- * @brief These functions convert Unicode codepoints between lower, upper and title case.
- */
- //@{
-
-
- /** @brief Convert the Unicode codepoint to upper-case.
- *
- * This function only uses the simple case mapping present in the
- * UnicodeData file. The data in SpecialCasing is not used because it
- * requires Unicode codepoints to be mapped to multiple codepoints,
- * whereas this function returns a single codepoint.
- *
- * @param c The Unicode codepoint to convert.
- * @return The upper-case Unicode codepoint for this codepoint, or
- * this codepoint if there is no upper-case codepoint.
- */
- codepoint_t toupper(codepoint_t c);
-
- /** @brief Convert the Unicode codepoint to lower-case.
- *
- * This function only uses the simple case mapping present in the
- * UnicodeData file. The data in SpecialCasing is not used because it
- * requires Unicode codepoints to be mapped to multiple codepoints,
- * whereas this function returns a single codepoint.
- *
- * @param c The Unicode codepoint to convert.
- * @return The lower-case Unicode codepoint for this codepoint, or
- * this codepoint if there is no lower-case codepoint.
- */
- codepoint_t tolower(codepoint_t c);
-
- /** @brief Convert the Unicode codepoint to title-case.
- *
- * This function only uses the simple case mapping present in the
- * UnicodeData file. The data in SpecialCasing is not used because it
- * requires Unicode codepoints to be mapped to multiple codepoints,
- * whereas this function returns a single codepoint.
- *
- * @param c The Unicode codepoint to convert.
- * @return The title-case Unicode codepoint for this codepoint, or
- * this codepoint if there is no title-case codepoint.
- */
- codepoint_t totitle(codepoint_t c);
-
-
- //@}
- }
-
- #endif
|