/* Unicode Character Database API
*
* Copyright (C) 2012 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef UNICODE_CHARACTER_DATA_H
#define UNICODE_CHARACTER_DATA_H
#include <stdint.h>
/** @brief Unicode Character Database
*/
namespace ucd
{
/** @brief Represents a Unicode codepoint.
*
* This is an alias for the unsigned 32-bit integer type @c uint32_t, and is
* the argument type accepted by the lookup, classification and case
* conversion functions declared in this namespace.
*/
typedef uint32_t codepoint_t;
/** @name Unicode General Category
* @brief These functions query the General Category property of Unicode codepoints.
*/
//@{
/** @brief Unicode General Category Groups
*
* Each enumerator is named after the leading letter shared by the two-letter
* General Category values in the @c category enumeration (for example @c L
* for @c Ll, @c Lm, @c Lo, @c Lt and @c Lu).
*
* @note NOTE(review): @c I pairs with the @c Ii ("Invalid Unicode Codepoint")
*       value of @c category; it does not appear to be one of the groups
*       listed in UAX #44 and is presumably specific to this API -- confirm.
*
* @see http://www.unicode.org/reports/tr44/
*/
enum category_group
{
C, /**< @brief Other */
I, /**< @brief Invalid */
L, /**< @brief Letter */
M, /**< @brief Mark */
N, /**< @brief Number */
P, /**< @brief Punctuation */
S, /**< @brief Symbol */
Z, /**< @brief Separator */
};
/** @brief Unicode General Category Values
*
* The enumerators use the two-letter General Category abbreviations. The
* first letter of each name matches an enumerator of @c category_group.
*
* @see http://www.unicode.org/reports/tr44/
*/
enum category
{
Cc, /**< @brief Control Character */
Cf, /**< @brief Format Control Character */
Cn, /**< @brief Unassigned */
Co, /**< @brief Private Use */
Cs, /**< @brief Surrogate Code Point */
Ii, /**< @brief Invalid Unicode Codepoint */
Ll, /**< @brief Lower Case Letter */
Lm, /**< @brief Modifier Letter */
Lo, /**< @brief Other Letter */
Lt, /**< @brief Title Case Letter */
Lu, /**< @brief Upper Case Letter */
Mc, /**< @brief Spacing Mark */
Me, /**< @brief Enclosing Mark */
Mn, /**< @brief Non-Spacing Mark */
Nd, /**< @brief Decimal Digit */
Nl, /**< @brief Letter Number */
No, /**< @brief Other Number */
Pc, /**< @brief Connector Punctuation */
Pd, /**< @brief Dash/Hyphen Punctuation */
Pe, /**< @brief Close Punctuation Mark */
Pf, /**< @brief Final Quotation Mark */
Pi, /**< @brief Initial Quotation Mark */
Po, /**< @brief Other Punctuation */
Ps, /**< @brief Open Punctuation Mark */
Sc, /**< @brief Currency Symbol */
Sk, /**< @brief Modifier Symbol */
Sm, /**< @brief Math Symbol */
So, /**< @brief Other Symbol */
Zl, /**< @brief Line Separator */
Zp, /**< @brief Paragraph Separator */
Zs, /**< @brief Space Separator */
};
/** @brief Lookup the General Category Group for a General Category.
*
* This overload takes an already-resolved General Category value rather
* than a codepoint.
*
* @param c The General Category to lookup.
* @return The General Category Group of the General Category.
*/
category_group lookup_category_group(category c);
/** @brief Lookup the General Category Group for a Unicode codepoint.
*
* This overload takes the codepoint itself rather than a General Category
* value.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category Group of the Unicode codepoint.
*/
category_group lookup_category_group(codepoint_t c);
/** @brief Lookup the General Category for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The General Category of the Unicode codepoint.
*/
category lookup_category(codepoint_t c);
//@}
/** @name Unicode Script
* @brief These functions query the Script property of Unicode codepoints.
*/
//@{
/** @brief Unicode Script
*
* The enumerators are named using four-letter script codes.
*
* @note No explicit values are given, so each enumerator's numeric value is
*       determined by its position in this list. The list is not strictly
*       alphabetical (@c Cakm follows @c Cari); reordering entries would
*       change their values.
*
* @note NOTE(review): ISO 15924 defines @c Zyyy as the code for an
*       undetermined script and reserves @c Zinh for inherited script, which
*       this enumeration does not declare. @c Zyyy was formerly described
*       here as "Inherited Script" -- confirm which codepoints
*       lookup_script() reports as @c Zyyy.
*
* @see http://www.iana.org/assignments/language-subtag-registry
* @see http://www.unicode.org/iso15924/iso15924-codes.html
*/
enum script
{
Arab, /**< @brief Arabic Script */
Armi, /**< @brief Imperial Aramaic Script */
Armn, /**< @brief Armenian Script */
Avst, /**< @brief Avestan Script */
Bali, /**< @brief Balinese Script */
Bamu, /**< @brief Bamum Script */
Batk, /**< @brief Batak Script */
Beng, /**< @brief Bengali Script */
Bopo, /**< @brief Bopomofo Script */
Brah, /**< @brief Brahmi Script */
Brai, /**< @brief Braille Script */
Bugi, /**< @brief Buginese Script */
Buhd, /**< @brief Buhid Script */
Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
Cari, /**< @brief Carian Script */
Cakm, /**< @brief Chakma Script */
Cham, /**< @brief Cham Script */
Cher, /**< @brief Cherokee Script */
Copt, /**< @brief Coptic Script */
Cprt, /**< @brief Cypriot Script */
Cyrl, /**< @brief Cyrillic Script */
Deva, /**< @brief Devanagari Script */
Dsrt, /**< @brief Deseret Script */
Egyp, /**< @brief Egyptian Hieroglyphs */
Ethi, /**< @brief Ethiopic Script */
Geor, /**< @brief Georgian Script */
Glag, /**< @brief Glagolitic Script */
Goth, /**< @brief Gothic Script */
Grek, /**< @brief Greek Script */
Gujr, /**< @brief Gujarati Script */
Guru, /**< @brief Gurmukhi Script */
Hang, /**< @brief Hangul Script */
Hano, /**< @brief Hanunoo Script */
Hant, /**< @brief Han (Traditional) Script */
Hebr, /**< @brief Hebrew Script */
Hira, /**< @brief Hiragana Script */
Ital, /**< @brief Old Italic Script */
Java, /**< @brief Javanese Script */
Kali, /**< @brief Kayah Li Script */
Kana, /**< @brief Katakana Script */
Khar, /**< @brief Kharoshthi Script */
Khmr, /**< @brief Khmer Script */
Knda, /**< @brief Kannada Script */
Kthi, /**< @brief Kaithi Script */
Lana, /**< @brief Tai Tham Script */
Laoo, /**< @brief Lao Script */
Latn, /**< @brief Latin Script */
Lepc, /**< @brief Lepcha Script */
Limb, /**< @brief Limbu Script */
Linb, /**< @brief Linear B Script */
Lisu, /**< @brief Lisu Script */
Lyci, /**< @brief Lycian Script */
Lydi, /**< @brief Lydian Script */
Mand, /**< @brief Mandaic Script */
Merc, /**< @brief Meroitic Cursive Script */
Mero, /**< @brief Meroitic Hieroglyphs */
Mlym, /**< @brief Malayalam Script */
Mong, /**< @brief Mongolian Script */
Mtei, /**< @brief Meitei Mayek Script */
Mymr, /**< @brief Myanmar Script */
Nkoo, /**< @brief N'Ko Script */
Ogam, /**< @brief Ogham Script */
Olck, /**< @brief Ol Chiki Script */
Orkh, /**< @brief Old Turkic Script */
Orya, /**< @brief Oriya Script */
Osma, /**< @brief Osmanya Script */
Phag, /**< @brief Phags-Pa Script */
Phli, /**< @brief Inscriptional Pahlavi Script */
Phnx, /**< @brief Phoenician Script */
Plrd, /**< @brief Miao Script */
Prti, /**< @brief Inscriptional Parthian Script */
Rjng, /**< @brief Rejang Script */
Runr, /**< @brief Runic Script */
Samr, /**< @brief Samaritan Script */
Sarb, /**< @brief Old South Arabian Script */
Saur, /**< @brief Saurashtra Script */
Shaw, /**< @brief Shavian Script */
Shrd, /**< @brief Sharada Script */
Sinh, /**< @brief Sinhala Script */
Sora, /**< @brief Sora Sompeng Script */
Sund, /**< @brief Sundanese Script */
Sylo, /**< @brief Syloti Nagri Script */
Syrn, /**< @brief Syriac (Eastern) Script */
Tagb, /**< @brief Tagbanwa Script */
Takr, /**< @brief Takri Script */
Tale, /**< @brief Tai Le Script */
Talu, /**< @brief New Tai Lue Script */
Taml, /**< @brief Tamil Script */
Tavt, /**< @brief Tai Viet Script */
Telu, /**< @brief Telugu Script */
Tfng, /**< @brief Tifinagh Script */
Tglg, /**< @brief Tagalog Script */
Thaa, /**< @brief Thaana Script */
Thai, /**< @brief Thai Script */
Tibt, /**< @brief Tibetan Script */
Ugar, /**< @brief Ugaritic Script */
Vaii, /**< @brief Vai Script */
Xpeo, /**< @brief Old Persian Script */
Xsux, /**< @brief Cuneiform Script */
Yiii, /**< @brief Yi Script */
Zyyy, /**< @brief Undetermined Script (see the note above regarding Inherited) */
Zzzz, /**< @brief Unknown Script */
};
/** @brief Lookup the Script for a Unicode codepoint.
*
* @param c The Unicode codepoint to lookup.
* @return The Script of the Unicode codepoint, as a value of the @c script
*         enumeration.
*/
script lookup_script(codepoint_t c);
//@}
/** @name ctype-style APIs
* @brief These functions provide wctype compatible functions using the UCD data.
*/
//@{
/** @brief Is the codepoint an alpha-numeric character?
*
* Like the other predicates in this group, this is declared inside the
* @c ucd namespace and takes a @c codepoint_t, so it is distinct from the
* C library function of the same name.
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a letter or number, zero otherwise.
*/
int isalnum(codepoint_t c);
/** @brief Is the codepoint a letter?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a letter, zero otherwise.
*/
int isalpha(codepoint_t c);
/** @brief Is the codepoint a control character?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a control character, zero otherwise.
*/
int iscntrl(codepoint_t c);
/** @brief Is the codepoint a numeric character?
*
* NOTE(review): whether "number" here means only decimal digits or every
* codepoint in the Number group is not visible from this declaration --
* confirm against the implementation.
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a number, zero otherwise.
*/
int isdigit(codepoint_t c);
/** @brief Does the codepoint have a displayable glyph?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint has a displayable glyph, zero otherwise.
*/
int isgraph(codepoint_t c);
/** @brief Is the codepoint a lower-case letter?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a lower-case letter, zero otherwise.
*/
int islower(codepoint_t c);
/** @brief Is the codepoint a printable character?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a printable character, zero otherwise.
*/
int isprint(codepoint_t c);
/** @brief Is the codepoint a punctuation character?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a punctuation character, zero otherwise.
*/
int ispunct(codepoint_t c);
/** @brief Is the codepoint a whitespace character?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is a whitespace character, zero otherwise.
*/
int isspace(codepoint_t c);
/** @brief Is the codepoint an upper-case letter?
*
* @param c The Unicode codepoint to check.
* @return Non-zero if the codepoint is an upper-case letter, zero otherwise.
*/
int isupper(codepoint_t c);
//@}
/** @name Case Conversion APIs
* @brief These functions convert Unicode codepoints between lower, upper and title case.
*/
//@{
/** @brief Convert the Unicode codepoint to upper-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints, which a single
* @c codepoint_t return value cannot express.
*
* @param c The Unicode codepoint to convert.
* @return The upper-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
codepoint_t toupper(codepoint_t c);
/** @brief Convert the Unicode codepoint to lower-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints, which a single
* @c codepoint_t return value cannot express.
*
* @param c The Unicode codepoint to convert.
* @return The lower-case Unicode codepoint for this codepoint, or
* this codepoint if there is no lower-case codepoint.
*/
codepoint_t tolower(codepoint_t c);
/** @brief Convert the Unicode codepoint to title-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints, which a single
* @c codepoint_t return value cannot express.
*
* @param c The Unicode codepoint to convert.
* @return The title-case Unicode codepoint for this codepoint, or
* this codepoint if there is no title-case codepoint.
*/
codepoint_t totitle(codepoint_t c);
//@}
}
#endif