@@ -4,6 +4,7 @@ | |||
* Add `iswblank` and `iswxdigit` compatibility. | |||
* Improve ctype compatibility. | |||
* Add PropList property lookup.
## 9.0.0 - 2016-12-28 | |||
@@ -122,6 +122,7 @@ src_libucd_la_SOURCES = \ | |||
src/case.c \ | |||
src/categories.c \ | |||
src/ctype.c \ | |||
src/proplist.c \ | |||
src/scripts.c \ | |||
src/tostring.c | |||
@@ -65,10 +65,11 @@ Doxygen documentation format. | |||
The library exposes the following properties from the UCD data files: | |||
| Property | Description | | |||
|--------------------|-------------| | |||
| `General_Category` | A [General Category Value](http://www.unicode.org/reports/tr44/#General_Category_Values), including the higher-level grouping. | | |||
| `Script` | An [ISO 15924](http://www.unicode.org/iso15924/iso15924-codes.html) script code. | | |||
| C API | C++ API | Data | Description | | |||
|-----------------------|------------------------|-------------|-------------| | |||
| `ucd_lookup_category` | `ucd::lookup_category` | UnicodeData | A [General Category Value](http://www.unicode.org/reports/tr44/#General_Category_Values). | | |||
| `ucd_lookup_script` | `ucd::lookup_script` | Script | An [ISO 15924](http://www.unicode.org/iso15924/iso15924-codes.html) script code. | | |||
| `ucd_properties` | `ucd::properties` | PropList | The code point properties from the PropList Unicode data file. | | |||
### Case Conversion | |||
@@ -20,262 +20,10 @@ | |||
#include "ucd/ucd.h" | |||
/* Report whether a codepoint is listed as Other_Alphabetic.
 *
 * The callers in this file (ucd_isalnum and ucd_isalpha) only consult this
 * helper for codepoints whose General Category is Mn, Mc or So.
 *
 * @param c The Unicode codepoint to check.
 * @return  1 if the codepoint lies inside one of the ranges below, 0 otherwise.
 */
static int other_alphabetic_MnMcSo(codepoint_t c)
{
	/* Inclusive [first, last] ranges in ascending, non-overlapping order;
	 * the early exit in the scan below relies on that ordering.
	 */
	static const struct
	{
		codepoint_t first;
		codepoint_t last;
	} ranges[] = {
		{ 0x0345, 0x0345 },
		{ 0x05B0, 0x05BD },
		{ 0x05BF, 0x05BF },
		{ 0x05C1, 0x05C2 },
		{ 0x05C4, 0x05C5 },
		{ 0x05C7, 0x05C7 },
		{ 0x0610, 0x061A },
		{ 0x064B, 0x0657 },
		{ 0x0659, 0x065F },
		{ 0x0670, 0x0670 },
		{ 0x06D6, 0x06DC },
		{ 0x06E1, 0x06E4 },
		{ 0x06E7, 0x06E8 },
		{ 0x06ED, 0x06ED },
		{ 0x0711, 0x0711 },
		{ 0x0730, 0x073F },
		{ 0x07A6, 0x07B0 },
		{ 0x0816, 0x0817 },
		{ 0x081B, 0x0823 },
		{ 0x0825, 0x0827 },
		{ 0x0829, 0x082C },
		{ 0x08D4, 0x08DF },
		{ 0x08E3, 0x08E9 },
		{ 0x08F0, 0x08FF },
		{ 0x0900, 0x0903 },
		{ 0x093A, 0x093B },
		{ 0x093E, 0x094C },
		{ 0x094E, 0x094F },
		{ 0x0955, 0x0957 },
		{ 0x0962, 0x0963 },
		{ 0x0981, 0x0983 },
		{ 0x09BE, 0x09C4 },
		{ 0x09C7, 0x09C8 },
		{ 0x09CB, 0x09CC },
		{ 0x09D7, 0x09D7 },
		{ 0x09E2, 0x09E3 },
		{ 0x0A01, 0x0A03 },
		{ 0x0A3E, 0x0A42 },
		{ 0x0A47, 0x0A48 },
		{ 0x0A4B, 0x0A4C },
		{ 0x0A51, 0x0A51 },
		{ 0x0A70, 0x0A71 },
		{ 0x0A75, 0x0A75 },
		{ 0x0A81, 0x0A83 },
		{ 0x0ABE, 0x0AC5 },
		{ 0x0AC7, 0x0AC9 },
		{ 0x0ACB, 0x0ACC },
		{ 0x0AE2, 0x0AE3 },
		{ 0x0B01, 0x0B03 },
		{ 0x0B3E, 0x0B44 },
		{ 0x0B47, 0x0B48 },
		{ 0x0B4B, 0x0B4C },
		{ 0x0B56, 0x0B57 },
		{ 0x0B62, 0x0B63 },
		{ 0x0B82, 0x0B82 },
		{ 0x0BBE, 0x0BC2 },
		{ 0x0BC6, 0x0BC8 },
		{ 0x0BCA, 0x0BCC },
		{ 0x0BD7, 0x0BD7 },
		{ 0x0C00, 0x0C03 },
		{ 0x0C3E, 0x0C44 },
		{ 0x0C46, 0x0C48 },
		{ 0x0C4A, 0x0C4C },
		{ 0x0C55, 0x0C56 },
		{ 0x0C62, 0x0C63 },
		{ 0x0C81, 0x0C83 },
		{ 0x0CBE, 0x0CBF },
		{ 0x0CC0, 0x0CC4 },
		{ 0x0CC6, 0x0CC8 },
		{ 0x0CCA, 0x0CCC },
		{ 0x0CD5, 0x0CD6 },
		{ 0x0CE2, 0x0CE3 },
		{ 0x0D01, 0x0D03 },
		{ 0x0D3E, 0x0D44 },
		{ 0x0D46, 0x0D48 },
		{ 0x0D4A, 0x0D4C },
		{ 0x0D57, 0x0D57 },
		{ 0x0D62, 0x0D63 },
		{ 0x0D82, 0x0D83 },
		{ 0x0DCF, 0x0DD4 },
		{ 0x0DD6, 0x0DD6 },
		{ 0x0DD8, 0x0DDF },
		{ 0x0DF2, 0x0DF3 },
		{ 0x0E31, 0x0E31 },
		{ 0x0E34, 0x0E3A },
		{ 0x0E4D, 0x0E4D },
		{ 0x0EB1, 0x0EB1 },
		{ 0x0EB4, 0x0EB9 },
		{ 0x0EBB, 0x0EBD },
		{ 0x0ECD, 0x0ECD },
		{ 0x0F71, 0x0F7F },
		{ 0x0F80, 0x0F81 },
		{ 0x0F8D, 0x0F97 },
		{ 0x0F99, 0x0FBC },
		{ 0x102B, 0x1036 },
		{ 0x1038, 0x1038 },
		{ 0x103B, 0x103E },
		{ 0x1056, 0x1059 },
		{ 0x105E, 0x1060 },
		{ 0x1062, 0x1062 },
		{ 0x1067, 0x1068 },
		{ 0x1071, 0x1074 },
		{ 0x1082, 0x1086 },
		{ 0x109C, 0x109D },
		{ 0x135F, 0x135F },
		{ 0x1712, 0x1713 },
		{ 0x1732, 0x1733 },
		{ 0x1752, 0x1753 },
		{ 0x1772, 0x1773 },
		{ 0x17B6, 0x17C8 },
		{ 0x1885, 0x1886 },
		{ 0x18A9, 0x18A9 },
		{ 0x1920, 0x192B },
		{ 0x1930, 0x1938 },
		{ 0x1A17, 0x1A1B },
		{ 0x1A55, 0x1A5E },
		{ 0x1A61, 0x1A74 },
		{ 0x1B00, 0x1B04 },
		{ 0x1B35, 0x1B43 },
		{ 0x1B80, 0x1B82 },
		{ 0x1BA1, 0x1BA9 },
		{ 0x1BAC, 0x1BAD },
		{ 0x1BE7, 0x1BF1 },
		{ 0x1C24, 0x1C35 },
		{ 0x1CF2, 0x1CF3 },
		{ 0x1DE7, 0x1DF4 },
		{ 0x24B6, 0x24E9 },
		{ 0x2DE0, 0x2DFF },
		{ 0xA674, 0xA67B },
		{ 0xA69E, 0xA69F },
		{ 0xA823, 0xA827 },
		{ 0xA880, 0xA881 },
		{ 0xA8B4, 0xA8C3 },
		{ 0xA8C5, 0xA8C5 },
		{ 0xA926, 0xA92A },
		{ 0xA947, 0xA952 },
		{ 0xA980, 0xA983 },
		{ 0xA9B4, 0xA9BF },
		{ 0xAA29, 0xAA36 },
		{ 0xAA43, 0xAA43 },
		{ 0xAA4C, 0xAA4D },
		{ 0xAAB0, 0xAAB0 },
		{ 0xAAB2, 0xAAB4 },
		{ 0xAAB7, 0xAAB8 },
		{ 0xAABE, 0xAABE },
		{ 0xAAEB, 0xAAEF },
		{ 0xAAF5, 0xAAF5 },
		{ 0xABE3, 0xABEA },
		{ 0xFB1E, 0xFB1E },
		{ 0x10376, 0x1037A },
		{ 0x10A01, 0x10A03 },
		{ 0x10A05, 0x10A06 },
		{ 0x10A0C, 0x10A0F },
		{ 0x11000, 0x11002 },
		{ 0x11038, 0x11045 },
		{ 0x11082, 0x11082 },
		{ 0x110B0, 0x110B8 },
		{ 0x11100, 0x11102 },
		{ 0x11127, 0x11132 },
		{ 0x11180, 0x11182 },
		{ 0x111B3, 0x111BF },
		{ 0x1122C, 0x11234 },
		{ 0x11237, 0x11237 },
		{ 0x1123E, 0x1123E },
		{ 0x112DF, 0x112E8 },
		{ 0x11300, 0x11303 },
		{ 0x1133E, 0x11344 },
		{ 0x11347, 0x11348 },
		{ 0x1134B, 0x1134C },
		{ 0x11357, 0x11357 },
		{ 0x11362, 0x11363 },
		{ 0x11435, 0x11441 },
		{ 0x11443, 0x11445 },
		{ 0x114B0, 0x114C1 },
		{ 0x115AF, 0x115B5 },
		{ 0x115B8, 0x115BE },
		{ 0x115DC, 0x115DD },
		{ 0x11630, 0x1163E },
		{ 0x11640, 0x11640 },
		{ 0x116AB, 0x116B5 },
		{ 0x1171D, 0x1172A },
		{ 0x11C2F, 0x11C36 },
		{ 0x11C38, 0x11C3E },
		{ 0x11C92, 0x11CA7 },
		{ 0x11CA9, 0x11CB6 },
		{ 0x16B30, 0x16B36 },
		{ 0x16F51, 0x16F7E },
		{ 0x1BC9E, 0x1BC9E },
		{ 0x1E000, 0x1E006 },
		{ 0x1E008, 0x1E018 },
		{ 0x1E01B, 0x1E021 },
		{ 0x1E023, 0x1E024 },
		{ 0x1E026, 0x1E02A },
		{ 0x1E947, 0x1E947 },
		{ 0x1F130, 0x1F149 },
		{ 0x1F150, 0x1F169 },
		{ 0x1F170, 0x1F189 },
	};
	const unsigned int count = sizeof(ranges) / sizeof(ranges[0]);

	for (unsigned int i = 0; i < count; ++i)
	{
		// The table is sorted, so no later range can contain c either.
		if (c < ranges[i].first)
			return 0;
		if (c <= ranges[i].last)
			return 1;
	}
	return 0;
}
int ucd_isalnum(codepoint_t c) | |||
{ | |||
switch (ucd_lookup_category(c)) | |||
ucd_category cat = ucd_lookup_category(c); | |||
switch (cat) | |||
{ | |||
case UCD_CATEGORY_Lu: | |||
case UCD_CATEGORY_Ll: | |||
@@ -289,7 +37,7 @@ int ucd_isalnum(codepoint_t c) | |||
case UCD_CATEGORY_Mn: | |||
case UCD_CATEGORY_Mc: | |||
case UCD_CATEGORY_So: | |||
return other_alphabetic_MnMcSo(c); | |||
return (ucd_properties(c, cat) & UCD_PROPERTY_OTHER_ALPHABETIC) == UCD_PROPERTY_OTHER_ALPHABETIC; | |||
default: | |||
return 0; | |||
} | |||
@@ -297,7 +45,8 @@ int ucd_isalnum(codepoint_t c) | |||
int ucd_isalpha(codepoint_t c) | |||
{ | |||
switch (ucd_lookup_category(c)) | |||
ucd_category cat = ucd_lookup_category(c); | |||
switch (cat) | |||
{ | |||
case UCD_CATEGORY_Lu: | |||
case UCD_CATEGORY_Ll: | |||
@@ -309,7 +58,7 @@ int ucd_isalpha(codepoint_t c) | |||
case UCD_CATEGORY_Mn: | |||
case UCD_CATEGORY_Mc: | |||
case UCD_CATEGORY_So: | |||
return other_alphabetic_MnMcSo(c); | |||
return (ucd_properties(c, cat) & UCD_PROPERTY_OTHER_ALPHABETIC) == UCD_PROPERTY_OTHER_ALPHABETIC; | |||
default: | |||
return 0; | |||
} | |||
@@ -366,37 +115,19 @@ int ucd_isgraph(codepoint_t c) | |||
int ucd_islower(codepoint_t c) | |||
{ | |||
switch (ucd_lookup_category(c)) | |||
ucd_category cat = ucd_lookup_category(c); | |||
switch (cat) | |||
{ | |||
case UCD_CATEGORY_Ll: | |||
return 1; | |||
case UCD_CATEGORY_Lt: | |||
return ucd_toupper(c) != c; | |||
case UCD_CATEGORY_Lo: | |||
return c == 0xAA // Other_Lowercase : FEMININE ORDINAL INDICATOR | |||
|| c == 0xBA; // Other_Lowercase : MASCULINE ORDINAL INDICATOR | |||
case UCD_CATEGORY_Lm: | |||
return (c >= 0x02B0 && c <= 0x02B8) // Other_Lowercase | |||
|| (c >= 0x02C0 && c <= 0x02C1) // Other_Lowercase | |||
|| (c >= 0x02E0 && c <= 0x02E4) // Other_Lowercase | |||
|| c == 0x037A // Other_Lowercase | |||
|| (c >= 0x1D2C && c <= 0x1D6A) // Other_Lowercase | |||
|| c == 0x1D78 // Other_Lowercase | |||
|| (c >= 0x1D9B && c <= 0x1DBF) // Other_Lowercase | |||
|| c == 0x2071 // Other_Lowercase | |||
|| c == 0x207F // Other_Lowercase | |||
|| (c >= 0x2090 && c <= 0x209C) // Other_Lowercase | |||
|| (c >= 0x2C7C && c <= 0x2C7D) // Other_Lowercase | |||
|| (c >= 0xA69C && c <= 0xA69D) // Other_Lowercase | |||
|| c == 0xA770 // Other_Lowercase | |||
|| (c >= 0xA7F8 && c <= 0xA7F9) // Other_Lowercase | |||
|| (c >= 0xAB5C && c <= 0xAB5F); // Other_Lowercase | |||
case UCD_CATEGORY_Lo: | |||
case UCD_CATEGORY_Mn: | |||
return c == 0x0345; // Other_Lowercase : COMBINING GREEK YPOGEGRAMMENI | |||
case UCD_CATEGORY_Nl: | |||
return (c >= 0x2170 && c <= 0x217F); // Other_Lowercase | |||
case UCD_CATEGORY_So: | |||
return (c >= 0x24D0 && c <= 0x24E9); // Other_Lowercase | |||
return (ucd_properties(c, cat) & UCD_PROPERTY_OTHER_LOWERCASE) == UCD_PROPERTY_OTHER_LOWERCASE; | |||
default: | |||
return 0; | |||
} | |||
@@ -469,19 +200,16 @@ int ucd_isspace(codepoint_t c) | |||
int ucd_isupper(codepoint_t c) | |||
{ | |||
switch (ucd_lookup_category(c)) | |||
ucd_category cat = ucd_lookup_category(c); | |||
switch (cat) | |||
{ | |||
case UCD_CATEGORY_Lu: | |||
return 1; | |||
case UCD_CATEGORY_Lt: | |||
return ucd_tolower(c) != c; | |||
case UCD_CATEGORY_Nl: | |||
return (c >= 0x002160 && c <= 0x00216F); // Other_Uppercase | |||
case UCD_CATEGORY_So: | |||
return (c >= 0x0024B6 && c <= 0x0024CF) // Other_Uppercase | |||
|| (c >= 0x01F130 && c <= 0x01F149) // Other_Uppercase | |||
|| (c >= 0x01F150 && c <= 0x01F169) // Other_Uppercase | |||
|| (c >= 0x01F170 && c <= 0x01F189); // Other_Uppercase | |||
return (ucd_properties(c, cat) & UCD_PROPERTY_OTHER_UPPERCASE) == UCD_PROPERTY_OTHER_UPPERCASE; | |||
default: | |||
return 0; | |||
} |
@@ -325,6 +325,51 @@ const char *ucd_get_script_string(ucd_script s); | |||
*/ | |||
ucd_script ucd_lookup_script(codepoint_t c); | |||
/** @brief A set of codepoint properties, stored as a bit mask.
 *
 * Each UCD_PROPERTY_* constant below occupies its own bit, so a value of
 * this type can be tested by masking it with the constant of interest.
 */ | |||
typedef uint64_t ucd_property; | |||
static const ucd_property UCD_PROPERTY_WHITE_SPACE = 0x0000000000000001ull; /**< @brief White_Space */ | |||
static const ucd_property UCD_PROPERTY_BIDI_CONTROL = 0x0000000000000002ull; /**< @brief Bidi_Control */ | |||
static const ucd_property UCD_PROPERTY_JOIN_CONTROL = 0x0000000000000004ull; /**< @brief Join_Control */ | |||
static const ucd_property UCD_PROPERTY_DASH = 0x0000000000000008ull; /**< @brief Dash */ | |||
static const ucd_property UCD_PROPERTY_HYPHEN = 0x0000000000000010ull; /**< @brief Hyphen */ | |||
static const ucd_property UCD_PROPERTY_QUOTATION_MARK = 0x0000000000000020ull; /**< @brief Quotation_Mark */ | |||
static const ucd_property UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x0000000000000040ull; /**< @brief Terminal_Punctuation */ | |||
static const ucd_property UCD_PROPERTY_OTHER_MATH = 0x0000000000000080ull; /**< @brief Other_Math */ | |||
static const ucd_property UCD_PROPERTY_HEX_DIGIT = 0x0000000000000100ull; /**< @brief Hex_Digit */ | |||
static const ucd_property UCD_PROPERTY_ASCII_HEX_DIGIT = 0x0000000000000200ull; /**< @brief ASCII_Hex_Digit */ | |||
static const ucd_property UCD_PROPERTY_OTHER_ALPHABETIC = 0x0000000000000400ull; /**< @brief Other_Alphabetic */ | |||
static const ucd_property UCD_PROPERTY_IDEOGRAPHIC = 0x0000000000000800ull; /**< @brief Ideographic */ | |||
static const ucd_property UCD_PROPERTY_DIACRITIC = 0x0000000000001000ull; /**< @brief Diacritic */ | |||
static const ucd_property UCD_PROPERTY_EXTENDER = 0x0000000000002000ull; /**< @brief Extender */ | |||
static const ucd_property UCD_PROPERTY_OTHER_LOWERCASE = 0x0000000000004000ull; /**< @brief Other_Lowercase */ | |||
static const ucd_property UCD_PROPERTY_OTHER_UPPERCASE = 0x0000000000008000ull; /**< @brief Other_Uppercase */ | |||
static const ucd_property UCD_PROPERTY_NONCHARACTER_CODE_POINT = 0x0000000000010000ull; /**< @brief Noncharacter_Code_Point */ | |||
static const ucd_property UCD_PROPERTY_OTHER_GRAPHEME_EXTEND = 0x0000000000020000ull; /**< @brief Other_Grapheme_Extend */ | |||
static const ucd_property UCD_PROPERTY_IDS_BINARY_OPERATOR = 0x0000000000040000ull; /**< @brief IDS_Binary_Operator */ | |||
static const ucd_property UCD_PROPERTY_IDS_TRINARY_OPERATOR = 0x0000000000080000ull; /**< @brief IDS_Trinary_Operator */ | |||
static const ucd_property UCD_PROPERTY_RADICAL = 0x0000000000100000ull; /**< @brief Radical */ | |||
static const ucd_property UCD_PROPERTY_UNIFIED_IDEOGRAPH = 0x0000000000200000ull; /**< @brief Unified_Ideograph */ | |||
static const ucd_property UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT = 0x0000000000400000ull; /**< @brief Other_Default_Ignorable_Code_Point */ | |||
static const ucd_property UCD_PROPERTY_DEPRECATED = 0x0000000000800000ull; /**< @brief Deprecated */ | |||
static const ucd_property UCD_PROPERTY_SOFT_DOTTED = 0x0000000001000000ull; /**< @brief Soft_Dotted */ | |||
static const ucd_property UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION = 0x0000000002000000ull; /**< @brief Logical_Order_Exception */ | |||
static const ucd_property UCD_PROPERTY_OTHER_ID_START = 0x0000000004000000ull; /**< @brief Other_ID_Start */ | |||
static const ucd_property UCD_PROPERTY_OTHER_ID_CONTINUE = 0x0000000008000000ull; /**< @brief Other_ID_Continue */ | |||
static const ucd_property UCD_PROPERTY_SENTENCE_TERMINAL = 0x0000000010000000ull; /**< @brief Sentence_Terminal */ | |||
static const ucd_property UCD_PROPERTY_VARIATION_SELECTOR = 0x0000000020000000ull; /**< @brief Variation_Selector */ | |||
static const ucd_property UCD_PROPERTY_PATTERN_WHITE_SPACE = 0x0000000040000000ull; /**< @brief Pattern_White_Space */ | |||
static const ucd_property UCD_PROPERTY_PATTERN_SYNTAX = 0x0000000080000000ull; /**< @brief Pattern_Syntax */ | |||
static const ucd_property UCD_PROPERTY_PREPENDED_CONCATENATION_MARK = 0x0000000100000000ull; /**< @brief Prepended_Concatenation_Mark */ | |||
/** @brief Return the properties of the specified codepoint.
 *
 * @param c        The Unicode codepoint to lookup.
 * @param category The General Category of the codepoint; the callers visible
 *                 alongside this declaration pass the result of
 *                 ucd_lookup_category for the same codepoint.
 * @return The properties associated with the codepoint. Callers test the
 *         result by masking it with the UCD_PROPERTY_* constants.
 */ | |||
ucd_property ucd_properties(codepoint_t c, ucd_category category); | |||
/** @brief Is the codepoint in the 'alnum' class? | |||
* | |||
* @param c The Unicode codepoint to check. | |||
@@ -770,6 +815,57 @@ namespace ucd | |||
return (script)ucd_lookup_script(c); | |||
} | |||
/** @brief A set of codepoint properties; an alias of the C ucd_property type.
 */ | |||
typedef ucd_property property; | |||
/** @brief C++ names for the individual UCD_PROPERTY_* property flags.
 */
enum | |||
{ | |||
White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space */ | |||
Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control */ | |||
Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control */ | |||
Dash = UCD_PROPERTY_DASH, /**< @brief Dash */ | |||
Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen */ | |||
Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark */ | |||
Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation */ | |||
Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math */ | |||
Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit */ | |||
ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit */ | |||
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic */ | |||
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic */ | |||
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic */ | |||
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender */ | |||
Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase */ | |||
Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase */ | |||
Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point */ | |||
Other_Grapheme_Extend = UCD_PROPERTY_OTHER_GRAPHEME_EXTEND, /**< @brief Other_Grapheme_Extend */ | |||
IDS_Binary_Operator = UCD_PROPERTY_IDS_BINARY_OPERATOR, /**< @brief IDS_Binary_Operator */ | |||
IDS_Trinary_Operator = UCD_PROPERTY_IDS_TRINARY_OPERATOR, /**< @brief IDS_Trinary_Operator */ | |||
Radical = UCD_PROPERTY_RADICAL, /**< @brief Radical */ | |||
Unified_Ideograph = UCD_PROPERTY_UNIFIED_IDEOGRAPH, /**< @brief Unified_Ideograph */ | |||
Other_Default_Ignorable_Code_Point = UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT, /**< @brief Other_Default_Ignorable_Code_Point */ | |||
Deprecated = UCD_PROPERTY_DEPRECATED, /**< @brief Deprecated */ | |||
Soft_Dotted = UCD_PROPERTY_SOFT_DOTTED, /**< @brief Soft_Dotted */ | |||
Logical_Order_Exception = UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION, /**< @brief Logical_Order_Exception */ | |||
Other_ID_Start = UCD_PROPERTY_OTHER_ID_START, /**< @brief Other_ID_Start */ | |||
Other_ID_Continue = UCD_PROPERTY_OTHER_ID_CONTINUE, /**< @brief Other_ID_Continue */ | |||
Sentence_Terminal = UCD_PROPERTY_SENTENCE_TERMINAL, /**< @brief Sentence_Terminal */ | |||
Variation_Selector = UCD_PROPERTY_VARIATION_SELECTOR, /**< @brief Variation_Selector */ | |||
Pattern_White_Space = UCD_PROPERTY_PATTERN_WHITE_SPACE, /**< @brief Pattern_White_Space */ | |||
Pattern_Syntax = UCD_PROPERTY_PATTERN_SYNTAX, /**< @brief Pattern_Syntax */ | |||
Prepended_Concatenation_Mark = UCD_PROPERTY_PREPENDED_CONCATENATION_MARK, /**< @brief Prepended_Concatenation_Mark */ | |||
}; | |||
/** @brief Return the properties of the specified codepoint. | |||
* | |||
* @param c The Unicode codepoint to lookup. | |||
* @param cat The General Category of the codepoint. | |||
* @return The properties associated with the codepoint. | |||
*/ | |||
inline property properties(codepoint_t c, category cat) | |||
{ | |||
return (property)ucd_properties(c, (ucd_category)cat); | |||
} | |||
/** @brief Is the codepoint in the 'alnum' class? | |||
* | |||
* @param c The Unicode codepoint to check. |
@@ -163,6 +163,9 @@ void uprintf(FILE *out, codepoint_t c, const char *format) | |||
case 'p': // codepoint | |||
uprintf_codepoint(out, c, *++format); | |||
break; | |||
case 'P': // properties | |||
fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c))); | |||
break; | |||
case 'i': // is* | |||
uprintf_is(out, c, *++format); | |||
break; | |||
@@ -249,7 +252,7 @@ int main(int argc, char **argv) | |||
{ | |||
for (codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
uprintf(stdout, c, format ? format : | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n"); | |||
} | |||
return 0; | |||
} |
@@ -160,6 +160,9 @@ void uprintf(FILE *out, codepoint_t c, const char *format) | |||
case 'p': // codepoint | |||
uprintf_codepoint(out, c, *++format); | |||
break; | |||
case 'P': // properties | |||
fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c))); | |||
break; | |||
case 'i': // is* | |||
uprintf_is(out, c, *++format); | |||
break; | |||
@@ -244,7 +247,7 @@ int main(int argc, char **argv) | |||
{ | |||
for (codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
uprintf(stdout, c, format ? format : | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n"); | |||
} | |||
return 0; | |||
} |
@@ -160,6 +160,9 @@ void uprintf(FILE *out, ucd::codepoint_t c, const char *format) | |||
case 'p': // codepoint | |||
uprintf_codepoint(out, c, *++format); | |||
break; | |||
case 'P': // properties | |||
fprintf(out, "%016llx", ucd::properties(c, ucd::lookup_category(c))); | |||
break; | |||
case 'i': // is* | |||
uprintf_is(out, c, *++format); | |||
break; | |||
@@ -244,7 +247,7 @@ int main(int argc, char **argv) | |||
{ | |||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
uprintf(stdout, c, format ? format : | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n"); | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,39 @@ | |||
#!/usr/bin/python3 | |||
# Copyright (C) 2017 Reece H. Dunn | |||
# | |||
# This file is part of ucd-tools. | |||
# | |||
# ucd-tools is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 3 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# ucd-tools is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# You should have received a copy of the GNU General Public License | |||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
import os | |||
import sys | |||
def readtable(filename):
    """Yield the leading fields of every data line in a mapping table file.

    Lines that are empty or contain only whitespace, and lines whose first
    non-blank character is '#', are skipped. For every other line the first
    two whitespace-separated fields are yielded as a list (the list is
    shorter if the line has fewer than two fields).

    filename -- path of the text file to read.
    """
    with open(filename) as f:
        for line in f:
            # Strip first so that whitespace-only lines and indented comments
            # are skipped instead of being yielded as malformed entries.
            entry = line.strip()
            if not entry or entry.startswith('#'):
                continue
            yield entry.split()[0:2]
# Map every code listed in the table file (first command-line argument) to
# its lower-cased mapped value.
table = {int(code, 0): mapped.lower() for code, mapped in readtable(sys.argv[1])}

# Print the codes 0x80..0xFF as tab-indented rows of eight comma-separated
# values. Each row ends with a comment holding the row's first code in hex;
# codes missing from the table are printed as 0xfffd.
for row_start in range(0x80, 0x100, 8):
    cells = ''.join('%s, ' % table.get(code, '0xfffd')
                    for code in range(row_start, row_start + 8))
    print('\t%s// %02x\n' % (cells, row_start), end='')
@@ -113,6 +113,49 @@ def islower(data): | |||
else: | |||
return 0 | |||
def decomposition_type(data, dtype):
    """Return the 'DecompositionType' entry of data if it starts with dtype.

    Returns None when the entry is missing, empty, or starts with
    something else.
    """
    kind = data.get('DecompositionType')
    if not kind:
        return None
    return kind if kind.startswith(dtype) else None
def properties(data):
    """Pack the property flags found in data into one integer.

    The flag stored under each name below (0 when the name is absent) is
    multiplied by the weight of that name's bit and added to the result.
    The bit number is the name's position in the list: White_Space is
    bit 0 and Prepended_Concatenation_Mark is bit 32.
    """
    names = (
        'White_Space',
        'Bidi_Control',
        'Join_Control',
        'Dash',
        'Hyphen',
        'Quotation_Mark',
        'Terminal_Punctuation',
        'Other_Math',
        'Hex_Digit',
        'ASCII_Hex_Digit',
        'Other_Alphabetic',
        'Ideographic',
        'Diacritic',
        'Extender',
        'Other_Lowercase',
        'Other_Uppercase',
        'Noncharacter_Code_Point',
        'Other_Grapheme_Extend',
        'IDS_Binary_Operator',
        'IDS_Trinary_Operator',
        'Radical',
        'Unified_Ideograph',
        'Other_Default_Ignorable_Code_Point',
        'Deprecated',
        'Soft_Dotted',
        'Logical_Order_Exception',
        'Other_ID_Start',
        'Other_ID_Continue',
        'Sentence_Terminal',
        'Variation_Selector',
        'Pattern_White_Space',
        'Pattern_Syntax',
        'Prepended_Concatenation_Mark',
    )
    props = 0
    for bit, name in enumerate(names):
        props += (1 << bit) * data.get(name, 0)
    return props
if __name__ == '__main__': | |||
for codepoint in ucd.CodeRange('000000..10FFFF'): | |||
try: | |||
@@ -126,10 +169,11 @@ if __name__ == '__main__': | |||
if title == null: title = codepoint | |||
if upper == null: upper = codepoint | |||
if lower == null: lower = codepoint | |||
print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % ( | |||
print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %016x' % ( | |||
codepoint, script, | |||
data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'), | |||
upper, lower, title, | |||
isdigit(data), isxdigit(data), | |||
iscntrl(data), isspace(data), isblank(data), ispunct(data), | |||
isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data))) | |||
isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data), | |||
properties(data))) |