data/ucd | data/ucd | ||||
src/libucd.la | src/libucd.la | ||||
tests/printcdata | |||||
tests/printucddata | tests/printucddata | ||||
tests/printucddata_cpp | tests/printucddata_cpp | ||||
# Change Log | # Change Log | ||||
## 9.0.0.1 - (In Progress) | |||||
* Add `iswblank` and `iswxdigit` compatibility. | |||||
* Improve ctype compatibility. | |||||
## 9.0.0 - 2016-12-28 | ## 9.0.0 - 2016-12-28 | ||||
* Update to Unicode Character Data 9.0.0. | * Update to Unicode Character Data 9.0.0. |
UCD_VERSION=@UCD_VERSION@ | UCD_VERSION=@UCD_VERSION@ | ||||
UCD_ROOTDIR=data/ucd | UCD_ROOTDIR=data/ucd | ||||
UCD_SRCDIR=http://www.unicode.org/Public | |||||
data/ucd/PropList.txt: | data/ucd/PropList.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt | |||||
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropList.txt > $@ | |||||
data/ucd/DerivedCoreProperties.txt: | |||||
mkdir -pv data/ucd | |||||
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/DerivedCoreProperties.txt > $@ | |||||
data/ucd/PropertyValueAliases.txt: | data/ucd/PropertyValueAliases.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropertyValueAliases.txt | |||||
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropertyValueAliases.txt > $@ | |||||
data/ucd/Scripts.txt: | data/ucd/Scripts.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt | |||||
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/Scripts.txt > $@ | |||||
data/ucd/UnicodeData.txt: | data/ucd/UnicodeData.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt | |||||
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/UnicodeData.txt > $@ | |||||
############################# documentation ################################### | ############################# documentation ################################### | ||||
############################# tests ########################################### | ############################# tests ########################################### | ||||
noinst_bin_PROGRAMS += tests/printcdata | |||||
tests_printcdata_SOURCES = tests/printcdata.c | |||||
tests_printcdata_LDADD = src/libucd.la | |||||
noinst_bin_PROGRAMS += tests/printucddata | noinst_bin_PROGRAMS += tests/printucddata | ||||
tests_printucddata_SOURCES = tests/printucddata.c | tests_printucddata_SOURCES = tests/printucddata.c | ||||
tests_printucddata_LDADD = src/libucd.la | tests_printucddata_LDADD = src/libucd.la | ||||
tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | ||||
data/ucd/UnicodeData.txt \ | data/ucd/UnicodeData.txt \ | ||||
data/ucd/PropList.txt \ | data/ucd/PropList.txt \ | ||||
data/ucd/DerivedCoreProperties.txt \ | |||||
data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@ | tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@ | ||||
The following character classification functions are provided: | The following character classification functions are provided: | ||||
| C API | C++ API | | |||||
|---------------|----------------| | |||||
| `ucd_isalnum` | `ucd::isalnum` | | |||||
| `ucd_isalpha` | `ucd::isalpha` | | |||||
| `ucd_iscntrl` | `ucd::iscntrl` | | |||||
| `ucd_isdigit` | `ucd::isdigit` | | |||||
| `ucd_isgraph` | `ucd::isgraph` | | |||||
| `ucd_islower` | `ucd::islower` | | |||||
| `ucd_isprint` | `ucd::isprint` | | |||||
| `ucd_ispunct` | `ucd::ispunct` | | |||||
| `ucd_isspace` | `ucd::isspace` | | |||||
| `ucd_isupper` | `ucd::isupper` | | |||||
__NOTE:__ Equivalents for `isblank` and `isxdigit` are not provided. | |||||
| C API | C++ API | | |||||
|----------------|-----------------| | |||||
| `ucd_isalnum` | `ucd::isalnum` | | |||||
| `ucd_isalpha` | `ucd::isalpha` | | |||||
| `ucd_isblank` | `ucd::isblank` | | |||||
| `ucd_iscntrl` | `ucd::iscntrl` | | |||||
| `ucd_isdigit` | `ucd::isdigit` | | |||||
| `ucd_isgraph` | `ucd::isgraph` | | |||||
| `ucd_islower` | `ucd::islower` | | |||||
| `ucd_isprint` | `ucd::isprint` | | |||||
| `ucd_ispunct` | `ucd::ispunct` | | |||||
| `ucd_isspace` | `ucd::isspace` | | |||||
| `ucd_isupper` | `ucd::isupper` | | |||||
| `ucd_isxdigit` | `ucd::isxdigit` | | |||||
## Build Dependencies | ## Build Dependencies | ||||
/* ctype-style APIs. | /* ctype-style APIs. | ||||
* | * | ||||
* Copyright (C) 2012-2016 Reece H. Dunn | |||||
* Copyright (C) 2012-2017 Reece H. Dunn | |||||
* | * | ||||
* This file is part of ucd-tools. | * This file is part of ucd-tools. | ||||
* | * | ||||
#include "ucd/ucd.h" | #include "ucd/ucd.h" | ||||
/** @brief Check a codepoint against the hard-coded alphabetic ranges for marks and symbols.
 *
 * Called by ucd_isalnum and ucd_isalpha for codepoints whose general category
 * is Mn, Mc or So. The table is only intended for those categories; a range
 * may span codepoints of other categories that never reach this function.
 *
 * NOTE(review): the ranges presumably mirror the Other_Alphabetic entries of
 * PropList.txt for the UCD version the library is built against -- confirm
 * when updating the Unicode data.
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in one of the listed ranges, zero otherwise.
 */
static int other_alphabetic_MnMcSo(codepoint_t c)
{
	// Dispatch on the 256-codepoint page (c with the low 8 bits cleared) so
	// only the ranges that can contain c are tested.
	switch (c & 0xFFFFFF00)
	{
	case 0x0300:
		return c == 0x0345;
	case 0x0500:
		return (c >= 0x05B0 && c <= 0x05BD)
		    || c == 0x05BF
		    || (c >= 0x05C1 && c <= 0x05C2)
		    || (c >= 0x05C4 && c <= 0x05C5)
		    || c == 0x05C7;
	case 0x0600:
		return (c >= 0x0610 && c <= 0x061A)
		    || (c >= 0x064B && c <= 0x0657)
		    || (c >= 0x0659 && c <= 0x065F)
		    || c == 0x0670
		    || (c >= 0x06D6 && c <= 0x06DC)
		    || (c >= 0x06E1 && c <= 0x06E4)
		    || (c >= 0x06E7 && c <= 0x06E8)
		    || c == 0x06ED;
	case 0x0700:
		return c == 0x0711
		    || (c >= 0x0730 && c <= 0x073F)
		    || (c >= 0x07A6 && c <= 0x07B0);
	case 0x0800:
		return (c >= 0x0816 && c <= 0x0817)
		    || (c >= 0x081B && c <= 0x0823)
		    || (c >= 0x0825 && c <= 0x0827)
		    || (c >= 0x0829 && c <= 0x082C)
		    || (c >= 0x08D4 && c <= 0x08DF)
		    || (c >= 0x08E3 && c <= 0x08E9)
		    || c >= 0x08F0; // upper bound 0x08FF is implied by the page match
	case 0x0900:
		return (c >= 0x0900 && c <= 0x0903)
		    || (c >= 0x093A && c <= 0x093B)
		    || (c >= 0x093E && c <= 0x094C)
		    || (c >= 0x094E && c <= 0x094F)
		    || (c >= 0x0955 && c <= 0x0957)
		    || (c >= 0x0962 && c <= 0x0963)
		    || (c >= 0x0981 && c <= 0x0983)
		    || (c >= 0x09BE && c <= 0x09C4)
		    || (c >= 0x09C7 && c <= 0x09C8)
		    || (c >= 0x09CB && c <= 0x09CC)
		    || c == 0x09D7
		    || (c >= 0x09E2 && c <= 0x09E3);
	case 0x0A00:
		return (c >= 0x0A01 && c <= 0x0A03)
		    || (c >= 0x0A3E && c <= 0x0A42)
		    || (c >= 0x0A47 && c <= 0x0A48)
		    || (c >= 0x0A4B && c <= 0x0A4C)
		    || c == 0x0A51
		    || (c >= 0x0A70 && c <= 0x0A71)
		    || c == 0x0A75
		    || (c >= 0x0A81 && c <= 0x0A83)
		    || (c >= 0x0ABE && c <= 0x0AC5)
		    || (c >= 0x0AC7 && c <= 0x0AC9)
		    || (c >= 0x0ACB && c <= 0x0ACC)
		    || (c >= 0x0AE2 && c <= 0x0AE3);
	case 0x0B00:
		return (c >= 0x0B01 && c <= 0x0B03)
		    || (c >= 0x0B3E && c <= 0x0B44)
		    || (c >= 0x0B47 && c <= 0x0B48)
		    || (c >= 0x0B4B && c <= 0x0B4C)
		    || (c >= 0x0B56 && c <= 0x0B57)
		    || (c >= 0x0B62 && c <= 0x0B63)
		    || c == 0x0B82
		    || (c >= 0x0BBE && c <= 0x0BC2)
		    || (c >= 0x0BC6 && c <= 0x0BC8)
		    || (c >= 0x0BCA && c <= 0x0BCC)
		    || c == 0x0BD7;
	case 0x0C00:
		return (c >= 0x0C00 && c <= 0x0C03)
		    || (c >= 0x0C3E && c <= 0x0C44)
		    || (c >= 0x0C46 && c <= 0x0C48)
		    || (c >= 0x0C4A && c <= 0x0C4C)
		    || (c >= 0x0C55 && c <= 0x0C56)
		    || (c >= 0x0C62 && c <= 0x0C63)
		    || (c >= 0x0C81 && c <= 0x0C83)
		    || (c >= 0x0CBE && c <= 0x0CBF)
		    || (c >= 0x0CC0 && c <= 0x0CC4)
		    || (c >= 0x0CC6 && c <= 0x0CC8)
		    || (c >= 0x0CCA && c <= 0x0CCC)
		    || (c >= 0x0CD5 && c <= 0x0CD6)
		    || (c >= 0x0CE2 && c <= 0x0CE3);
	case 0x0D00:
		return (c >= 0x0D01 && c <= 0x0D03)
		    || (c >= 0x0D3E && c <= 0x0D44)
		    || (c >= 0x0D46 && c <= 0x0D48)
		    || (c >= 0x0D4A && c <= 0x0D4C)
		    || c == 0x0D57
		    || (c >= 0x0D62 && c <= 0x0D63)
		    || (c >= 0x0D82 && c <= 0x0D83)
		    || (c >= 0x0DCF && c <= 0x0DD4)
		    || c == 0x0DD6
		    || (c >= 0x0DD8 && c <= 0x0DDF)
		    || (c >= 0x0DF2 && c <= 0x0DF3);
	case 0x0E00:
		return c == 0x0E31
		    || (c >= 0x0E34 && c <= 0x0E3A)
		    || c == 0x0E4D
		    || c == 0x0EB1
		    || (c >= 0x0EB4 && c <= 0x0EB9)
		    || (c >= 0x0EBB && c <= 0x0EBD)
		    || c == 0x0ECD;
	case 0x0F00:
		return (c >= 0x0F71 && c <= 0x0F7F)
		    || (c >= 0x0F80 && c <= 0x0F81)
		    || (c >= 0x0F8D && c <= 0x0F97)
		    || (c >= 0x0F99 && c <= 0x0FBC);
	case 0x1000:
		return (c >= 0x102B && c <= 0x1036)
		    || c == 0x1038
		    || (c >= 0x103B && c <= 0x103E)
		    || (c >= 0x1056 && c <= 0x1059)
		    || (c >= 0x105E && c <= 0x1060)
		    || c == 0x1062
		    || (c >= 0x1067 && c <= 0x1068)
		    || (c >= 0x1071 && c <= 0x1074)
		    || (c >= 0x1082 && c <= 0x1086)
		    || (c >= 0x109C && c <= 0x109D);
	case 0x1300:
		return c == 0x135F;
	case 0x1700:
		return (c >= 0x1712 && c <= 0x1713)
		    || (c >= 0x1732 && c <= 0x1733)
		    || (c >= 0x1752 && c <= 0x1753)
		    || (c >= 0x1772 && c <= 0x1773)
		    || (c >= 0x17B6 && c <= 0x17C8);
	case 0x1800:
		return (c >= 0x1885 && c <= 0x1886)
		    || c == 0x18A9;
	case 0x1900:
		return (c >= 0x1920 && c <= 0x192B)
		    || (c >= 0x1930 && c <= 0x1938);
	case 0x1A00:
		return (c >= 0x1A17 && c <= 0x1A1B)
		    || (c >= 0x1A55 && c <= 0x1A5E)
		    || (c >= 0x1A61 && c <= 0x1A74);
	case 0x1B00:
		return (c >= 0x1B00 && c <= 0x1B04)
		    || (c >= 0x1B35 && c <= 0x1B43)
		    || (c >= 0x1B80 && c <= 0x1B82)
		    || (c >= 0x1BA1 && c <= 0x1BA9)
		    || (c >= 0x1BAC && c <= 0x1BAD)
		    || (c >= 0x1BE7 && c <= 0x1BF1);
	case 0x1C00:
		return (c >= 0x1C24 && c <= 0x1C35)
		    || (c >= 0x1CF2 && c <= 0x1CF3);
	case 0x1D00:
		return (c >= 0x1DE7 && c <= 0x1DF4);
	case 0x2400:
		return (c >= 0x24B6 && c <= 0x24E9);
	case 0x2D00:
		return (c >= 0x2DE0 && c <= 0x2DFF);
	case 0xA600:
		return (c >= 0xA674 && c <= 0xA67B)
		    || (c >= 0xA69E && c <= 0xA69F);
	case 0xA800:
		return (c >= 0xA823 && c <= 0xA827)
		    || (c >= 0xA880 && c <= 0xA881)
		    || (c >= 0xA8B4 && c <= 0xA8C3)
		    || c == 0xA8C5;
	case 0xA900:
		return (c >= 0xA926 && c <= 0xA92A)
		    || (c >= 0xA947 && c <= 0xA952)
		    || (c >= 0xA980 && c <= 0xA983)
		    || (c >= 0xA9B4 && c <= 0xA9BF);
	case 0xAA00:
		return (c >= 0xAA29 && c <= 0xAA36)
		    || c == 0xAA43
		    || (c >= 0xAA4C && c <= 0xAA4D)
		    || c == 0xAAB0
		    || (c >= 0xAAB2 && c <= 0xAAB4)
		    || (c >= 0xAAB7 && c <= 0xAAB8)
		    || c == 0xAABE
		    || (c >= 0xAAEB && c <= 0xAAEF)
		    || c == 0xAAF5;
	case 0xAB00:
		return (c >= 0xABE3 && c <= 0xABEA);
	case 0xFB00:
		return c == 0xFB1E;
	case 0x10300:
		return (c >= 0x10376 && c <= 0x1037A);
	case 0x10A00:
		return (c >= 0x10A01 && c <= 0x10A03)
		    || (c >= 0x10A05 && c <= 0x10A06)
		    || (c >= 0x10A0C && c <= 0x10A0F);
	case 0x11000:
		return (c >= 0x11000 && c <= 0x11002)
		    || (c >= 0x11038 && c <= 0x11045)
		    || c == 0x11082
		    || (c >= 0x110B0 && c <= 0x110B8);
	case 0x11100:
		return (c >= 0x11100 && c <= 0x11102)
		    || (c >= 0x11127 && c <= 0x11132)
		    || (c >= 0x11180 && c <= 0x11182)
		    || (c >= 0x111B3 && c <= 0x111BF);
	case 0x11200:
		return (c >= 0x1122C && c <= 0x11234)
		    || c == 0x11237
		    || c == 0x1123E
		    || (c >= 0x112DF && c <= 0x112E8);
	case 0x11300:
		return (c >= 0x11300 && c <= 0x11303)
		    || (c >= 0x1133E && c <= 0x11344)
		    || (c >= 0x11347 && c <= 0x11348)
		    || (c >= 0x1134B && c <= 0x1134C)
		    || c == 0x11357
		    || (c >= 0x11362 && c <= 0x11363);
	case 0x11400:
		return (c >= 0x11435 && c <= 0x11441)
		    || (c >= 0x11443 && c <= 0x11445)
		    || (c >= 0x114B0 && c <= 0x114C1);
	case 0x11500:
		return (c >= 0x115AF && c <= 0x115B5)
		    || (c >= 0x115B8 && c <= 0x115BE)
		    || (c >= 0x115DC && c <= 0x115DD);
	case 0x11600:
		return (c >= 0x11630 && c <= 0x1163E)
		    || c == 0x11640
		    || (c >= 0x116AB && c <= 0x116B5);
	case 0x11700:
		return (c >= 0x1171D && c <= 0x1172A);
	case 0x11C00:
		return (c >= 0x11C2F && c <= 0x11C36)
		    || (c >= 0x11C38 && c <= 0x11C3E)
		    || (c >= 0x11C92 && c <= 0x11CA7)
		    || (c >= 0x11CA9 && c <= 0x11CB6);
	case 0x16B00:
		return (c >= 0x16B30 && c <= 0x16B36);
	case 0x16F00:
		return (c >= 0x16F51 && c <= 0x16F7E);
	case 0x1BC00:
		return c == 0x1BC9E;
	case 0x1E000:
		return (c >= 0x1E000 && c <= 0x1E006)
		    || (c >= 0x1E008 && c <= 0x1E018)
		    || (c >= 0x1E01B && c <= 0x1E021)
		    || (c >= 0x1E023 && c <= 0x1E024)
		    || (c >= 0x1E026 && c <= 0x1E02A);
	case 0x1E900:
		return c == 0x1E947;
	case 0x1F100:
		return (c >= 0x01F130 && c <= 0x01F149)
		    || (c >= 0x01F150 && c <= 0x01F169)
		    || (c >= 0x01F170 && c <= 0x01F189);
	default:
		return 0;
	}
}
int ucd_isalnum(codepoint_t c) | int ucd_isalnum(codepoint_t c) | ||||
{ | { | ||||
switch (ucd_lookup_category(c)) | switch (ucd_lookup_category(c)) | ||||
{ | { | ||||
case UCD_CATEGORY_Lu: | |||||
case UCD_CATEGORY_Ll: | case UCD_CATEGORY_Ll: | ||||
case UCD_CATEGORY_Lt: | |||||
case UCD_CATEGORY_Lm: | case UCD_CATEGORY_Lm: | ||||
case UCD_CATEGORY_Lo: | case UCD_CATEGORY_Lo: | ||||
case UCD_CATEGORY_Lt: | |||||
case UCD_CATEGORY_Lu: | |||||
case UCD_CATEGORY_Nd: | |||||
case UCD_CATEGORY_Nl: | case UCD_CATEGORY_Nl: | ||||
case UCD_CATEGORY_Nd: | |||||
case UCD_CATEGORY_No: | case UCD_CATEGORY_No: | ||||
return 1; | return 1; | ||||
case UCD_CATEGORY_Mn: | |||||
case UCD_CATEGORY_Mc: | |||||
case UCD_CATEGORY_So: | |||||
return other_alphabetic_MnMcSo(c); | |||||
default: | default: | ||||
return 0; | return 0; | ||||
} | } | ||||
{ | { | ||||
switch (ucd_lookup_category(c)) | switch (ucd_lookup_category(c)) | ||||
{ | { | ||||
case UCD_CATEGORY_Lu: | |||||
case UCD_CATEGORY_Ll: | case UCD_CATEGORY_Ll: | ||||
case UCD_CATEGORY_Lt: | |||||
case UCD_CATEGORY_Lm: | case UCD_CATEGORY_Lm: | ||||
case UCD_CATEGORY_Lo: | case UCD_CATEGORY_Lo: | ||||
case UCD_CATEGORY_Lt: | |||||
case UCD_CATEGORY_Lu: | |||||
case UCD_CATEGORY_Nl: | |||||
return 1; | return 1; | ||||
case UCD_CATEGORY_Mn: | |||||
case UCD_CATEGORY_Mc: | |||||
case UCD_CATEGORY_So: | |||||
return other_alphabetic_MnMcSo(c); | |||||
default: | default: | ||||
return 0; | return 0; | ||||
} | } | ||||
} | } | ||||
/** @brief Is the codepoint in the 'cntrl' class?
 *
 * A codepoint is in the class exactly when its general category is Cc.
 *
 * @param c The Unicode codepoint to check.
 * @return 1 if the general category of the codepoint is Cc, 0 otherwise.
 */
int ucd_iscntrl(codepoint_t c)
{
	switch (ucd_lookup_category(c))
	{
	case UCD_CATEGORY_Cc:
		return 1;
	default:
		return 0;
	}
}
int ucd_isdigit(codepoint_t c) | |||||
/** @brief Is the codepoint in the 'blank' class?
 *
 * Accepts U+0009 and every Zs (space separator) codepoint except the three
 * no-break spaces listed below. Number categories are not part of the class.
 *
 * @param c The Unicode codepoint to check.
 * @return 1 if the codepoint is in the 'blank' class, 0 otherwise.
 */
int ucd_isblank(codepoint_t c)
{
	switch (ucd_lookup_category(c))
	{
	case UCD_CATEGORY_Zs:
		switch (c) // Exclude characters with the <noBreak> DispositionType
		{
		case 0x00A0: // U+00A0 : NO-BREAK SPACE
		case 0x2007: // U+2007 : FIGURE SPACE
		case 0x202F: // U+202F : NARROW NO-BREAK SPACE
			return 0;
		default:
			return 1;
		}
	case UCD_CATEGORY_Cc:
		return c == 0x09; // U+0009 : CHARACTER TABULATION
	default:
		return 0;
	}
}
/** @brief Is the codepoint in the 'cntrl' class?
 *
 * A codepoint is in the class exactly when its general category is Cc.
 *
 * @param c The Unicode codepoint to check.
 * @return 1 if the general category of the codepoint is Cc, 0 otherwise.
 */
int ucd_iscntrl(codepoint_t c)
{
	switch (ucd_lookup_category(c))
	{
	case UCD_CATEGORY_Cc:
		return 1;
	default:
		return 0;
	}
}
/** @brief Is the codepoint in the 'digit' class?
 *
 * Only U+0030..U+0039 are accepted; no category lookup is performed.
 *
 * @param c The Unicode codepoint to check.
 * @return 1 if the codepoint is in the range [0-9], 0 otherwise.
 */
int ucd_isdigit(codepoint_t c)
{
	if (c < 0x30) // below '0'
		return 0;
	return c <= 0x39; // up to and including '9'
}
int ucd_isgraph(codepoint_t c) | int ucd_isgraph(codepoint_t c) | ||||
{ | { | ||||
switch (ucd_lookup_category(c)) | switch (ucd_lookup_category(c)) | ||||
/** @brief Is the codepoint in the 'lower' class?
 *
 * Every Ll codepoint is lower-case. A title-case (Lt) codepoint counts as
 * lower-case when ucd_toupper maps it to a different codepoint. The remaining
 * categories contribute only the explicitly listed Other_Lowercase codepoints.
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
 */
int ucd_islower(codepoint_t c)
{
	switch (ucd_lookup_category(c))
	{
	case UCD_CATEGORY_Ll:
		return 1;
	case UCD_CATEGORY_Lt:
		return ucd_toupper(c) != c;
	case UCD_CATEGORY_Lo:
		return c == 0xAA  // Other_Lowercase : FEMININE ORDINAL INDICATOR
		    || c == 0xBA; // Other_Lowercase : MASCULINE ORDINAL INDICATOR
	case UCD_CATEGORY_Lm:
		return (c >= 0x02B0 && c <= 0x02B8)  // Other_Lowercase
		    || (c >= 0x02C0 && c <= 0x02C1)  // Other_Lowercase
		    || (c >= 0x02E0 && c <= 0x02E4)  // Other_Lowercase
		    || c == 0x037A                   // Other_Lowercase
		    || (c >= 0x1D2C && c <= 0x1D6A)  // Other_Lowercase
		    || c == 0x1D78                   // Other_Lowercase
		    || (c >= 0x1D9B && c <= 0x1DBF)  // Other_Lowercase
		    || c == 0x2071                   // Other_Lowercase
		    || c == 0x207F                   // Other_Lowercase
		    || (c >= 0x2090 && c <= 0x209C)  // Other_Lowercase
		    || (c >= 0x2C7C && c <= 0x2C7D)  // Other_Lowercase
		    || (c >= 0xA69C && c <= 0xA69D)  // Other_Lowercase
		    || c == 0xA770                   // Other_Lowercase
		    || (c >= 0xA7F8 && c <= 0xA7F9)  // Other_Lowercase
		    || (c >= 0xAB5C && c <= 0xAB5F); // Other_Lowercase
	case UCD_CATEGORY_Mn:
		return c == 0x0345; // Other_Lowercase : COMBINING GREEK YPOGEGRAMMENI
	case UCD_CATEGORY_Nl:
		return (c >= 0x2170 && c <= 0x217F); // Other_Lowercase
	case UCD_CATEGORY_So:
		return (c >= 0x24D0 && c <= 0x24E9); // Other_Lowercase
	default:
		return 0;
	}
}
int ucd_isprint(codepoint_t c) | int ucd_isprint(codepoint_t c) | ||||
{ | { | ||||
case UCD_CATEGORY_Zl: | case UCD_CATEGORY_Zl: | ||||
case UCD_CATEGORY_Zp: | case UCD_CATEGORY_Zp: | ||||
return 1; | |||||
case UCD_CATEGORY_Zs: | case UCD_CATEGORY_Zs: | ||||
switch (c) // Exclude characters with the <noBreak> DispositionType | |||||
{ | |||||
case 0x00A0: // U+00A0 : NO-BREAK SPACE | |||||
case 0x2007: // U+2007 : FIGURE SPACE | |||||
case 0x202F: // U+202F : NARROW NO-BREAK SPACE | |||||
return 0; | |||||
} | |||||
return 1; | return 1; | ||||
case UCD_CATEGORY_Cc: | case UCD_CATEGORY_Cc: | ||||
switch (c) // Some control characters are also whitespace characters: | |||||
switch (c) // Include control characters marked as White_Space | |||||
{ | { | ||||
case 0x09: // U+0009 : CHARACTER TABULATION | case 0x09: // U+0009 : CHARACTER TABULATION | ||||
case 0x0A: // U+000A : LINE FEED | case 0x0A: // U+000A : LINE FEED | ||||
/** @brief Is the codepoint in the 'upper' class?
 *
 * Every Lu codepoint is upper-case. A title-case (Lt) codepoint counts as
 * upper-case when ucd_tolower maps it to a different codepoint. The Nl and So
 * categories contribute only the explicitly listed Other_Uppercase ranges.
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
 */
int ucd_isupper(codepoint_t c)
{
	switch (ucd_lookup_category(c))
	{
	case UCD_CATEGORY_Lu:
		return 1;
	case UCD_CATEGORY_Lt:
		return ucd_tolower(c) != c;
	case UCD_CATEGORY_Nl:
		return (c >= 0x002160 && c <= 0x00216F); // Other_Uppercase
	case UCD_CATEGORY_So:
		return (c >= 0x0024B6 && c <= 0x0024CF)  // Other_Uppercase
		    || (c >= 0x01F130 && c <= 0x01F149)  // Other_Uppercase
		    || (c >= 0x01F150 && c <= 0x01F169)  // Other_Uppercase
		    || (c >= 0x01F170 && c <= 0x01F189); // Other_Uppercase
	default:
		return 0;
	}
}
/** @brief Is the codepoint in the 'xdigit' class?
 *
 * Only the ASCII hexadecimal digits are accepted; no category lookup is
 * performed.
 *
 * @param c The Unicode codepoint to check.
 * @return 1 if the codepoint is in [0-9], [A-F] or [a-f], 0 otherwise.
 */
int ucd_isxdigit(codepoint_t c)
{
	if (c >= 0x30 && c <= 0x39) // [0-9]
		return 1;
	if (c >= 0x41 && c <= 0x46) // [A-F]
		return 1;
	return c >= 0x61 && c <= 0x66; // [a-f]
}
/* Unicode Character Database API | /* Unicode Character Database API | ||||
* | * | ||||
* Copyright (C) 2012-2016 Reece H. Dunn | |||||
* Copyright (C) 2012-2017 Reece H. Dunn | |||||
* | * | ||||
* This file is part of ucd-tools. | * This file is part of ucd-tools. | ||||
* | * | ||||
*/ | */ | ||||
ucd_script ucd_lookup_script(codepoint_t c); | ucd_script ucd_lookup_script(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'alnum' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
 */
int ucd_isalnum(codepoint_t c); | int ucd_isalnum(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'alpha' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
 */
int ucd_isalpha(codepoint_t c); | int ucd_isalpha(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'blank' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
 */
int ucd_isblank(codepoint_t c); | |||||
/** @brief Is the codepoint in the 'cntrl' class? | |||||
* | |||||
* @param c The Unicode codepoint to check. | |||||
* @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise. | |||||
*/ | */ | ||||
int ucd_iscntrl(codepoint_t c); | int ucd_iscntrl(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'digit' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
 */
int ucd_isdigit(codepoint_t c); | int ucd_isdigit(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'graph' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
 */
int ucd_isgraph(codepoint_t c); | int ucd_isgraph(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'lower' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
 */
int ucd_islower(codepoint_t c); | int ucd_islower(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'print' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
 */
int ucd_isprint(codepoint_t c); | int ucd_isprint(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'punct' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
 */
int ucd_ispunct(codepoint_t c); | int ucd_ispunct(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'space' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
 */
int ucd_isspace(codepoint_t c); | int ucd_isspace(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'upper' class?
 *
 * @param c The Unicode codepoint to check.
 * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
 */
int ucd_isupper(codepoint_t c); | int ucd_isupper(codepoint_t c); | ||||
/** @brief Is the codepoint in the 'xdigit' class? | |||||
* | |||||
* @param c The Unicode codepoint to check. | |||||
* @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise. | |||||
*/ | |||||
int ucd_isxdigit(codepoint_t c); | |||||
/** @brief Convert the Unicode codepoint to upper-case. | /** @brief Convert the Unicode codepoint to upper-case. | ||||
* | * | ||||
* This function only uses the simple case mapping present in the | * This function only uses the simple case mapping present in the | ||||
return (script)ucd_lookup_script(c); | return (script)ucd_lookup_script(c); | ||||
} | } | ||||
/** @brief Is the codepoint an alpha-numeric character? | |||||
/** @brief Is the codepoint in the 'alnum' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a letter or number, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isalnum(codepoint_t c) | inline int isalnum(codepoint_t c) | ||||
{ | { | ||||
return ucd_isalnum(c); | return ucd_isalnum(c); | ||||
} | } | ||||
/** @brief Is the codepoint a letter? | |||||
/** @brief Is the codepoint in the 'alpha' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a letter, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isalpha(codepoint_t c) | inline int isalpha(codepoint_t c) | ||||
{ | { | ||||
return ucd_isalpha(c); | return ucd_isalpha(c); | ||||
} | } | ||||
/** @brief Is the codepoint a control character? | |||||
/** @brief Is the codepoint in the 'blank' class? | |||||
* | |||||
* @param c The Unicode codepoint to check. | |||||
* @return Non-zero if the codepoint is in the 'blank' class, zero otherwise. | |||||
*/ | |||||
inline int isblank(codepoint_t c) | |||||
{ | |||||
return ucd_isblank(c); | |||||
} | |||||
/** @brief Is the codepoint in the 'cntrl' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a control character, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise. | |||||
*/ | */ | ||||
inline int iscntrl(codepoint_t c) | inline int iscntrl(codepoint_t c) | ||||
{ | { | ||||
return ucd_iscntrl(c); | return ucd_iscntrl(c); | ||||
} | } | ||||
/** @brief Is the codepoint a numeric character? | |||||
/** @brief Is the codepoint in the 'digit' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a number, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'digit' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isdigit(codepoint_t c) | inline int isdigit(codepoint_t c) | ||||
{ | { | ||||
return ucd_isdigit(c); | return ucd_isdigit(c); | ||||
} | } | ||||
/** @brief Does the codepoint have a displayable glyph? | |||||
/** @brief Is the codepoint in the 'graph' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint has a displayable glyph, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'graph' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isgraph(codepoint_t c) | inline int isgraph(codepoint_t c) | ||||
{ | { | ||||
return ucd_isgraph(c); | return ucd_isgraph(c); | ||||
} | } | ||||
/** @brief Is the codepoint a lower-case letter? | |||||
/** @brief Is the codepoint in the 'lower' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a lower-case letter, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'lower' class, zero otherwise. | |||||
*/ | */ | ||||
inline int islower(codepoint_t c) | inline int islower(codepoint_t c) | ||||
{ | { | ||||
return ucd_islower(c); | return ucd_islower(c); | ||||
} | } | ||||
/** @brief Is the codepoint a printable character? | |||||
/** @brief Is the codepoint in the 'print' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a printable character, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'print' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isprint(codepoint_t c) | inline int isprint(codepoint_t c) | ||||
{ | { | ||||
return ucd_isprint(c); | return ucd_isprint(c); | ||||
} | } | ||||
/** @brief Is the codepoint a punctuation character? | |||||
/** @brief Is the codepoint in the 'punct' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a punctuation character, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'punct' class, zero otherwise. | |||||
*/ | */ | ||||
inline int ispunct(codepoint_t c) | inline int ispunct(codepoint_t c) | ||||
{ | { | ||||
return ucd_ispunct(c); | return ucd_ispunct(c); | ||||
} | } | ||||
/** @brief Is the codepoint a whitespace character? | |||||
/** @brief Is the codepoint in the 'space' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is a whitespace character, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'space' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isspace(codepoint_t c) | inline int isspace(codepoint_t c) | ||||
{ | { | ||||
return ucd_isspace(c); | return ucd_isspace(c); | ||||
} | } | ||||
/** @brief Is the codepoint an upper-case letter? | |||||
/** @brief Is the codepoint in the 'upper' class? | |||||
* | * | ||||
* @param c The Unicode codepoint to check. | * @param c The Unicode codepoint to check. | ||||
* @return Non-zero if the codepoint is an upper-case letter, zero otherwise. | |||||
* @return Non-zero if the codepoint is in the 'upper' class, zero otherwise. | |||||
*/ | */ | ||||
inline int isupper(codepoint_t c) | inline int isupper(codepoint_t c) | ||||
{ | { | ||||
return ucd_isupper(c); | return ucd_isupper(c); | ||||
} | } | ||||
/** @brief Is the codepoint in the 'xdigit' class? | |||||
* | |||||
* @param c The Unicode codepoint to check. | |||||
* @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise. | |||||
*/ | |||||
inline int isxdigit(codepoint_t c) | |||||
{ | |||||
return ucd_isxdigit(c); | |||||
} | |||||
/** @brief Convert the Unicode codepoint to upper-case. | /** @brief Convert the Unicode codepoint to upper-case. | ||||
* | * | ||||
* This function only uses the simple case mapping present in the | * This function only uses the simple case mapping present in the |
/* | |||||
* Copyright (C) 2012-2017 Reece H. Dunn | |||||
* | |||||
* This file is part of ucd-tools. | |||||
* | |||||
* ucd-tools is free software: you can redistribute it and/or modify | |||||
* it under the terms of the GNU General Public License as published by | |||||
* the Free Software Foundation, either version 3 of the License, or | |||||
* (at your option) any later version. | |||||
* | |||||
* ucd-tools is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
* GNU General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU General Public License | |||||
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
*/ | |||||
#include "ucd/ucd.h" | |||||
#include <locale.h> | |||||
#include <string.h> | |||||
#include <stdio.h> | |||||
#include <wchar.h> | |||||
#include <wctype.h> | |||||
/** @brief Write a codepoint to a stream as a UTF-8 byte sequence.
 *
 * Codepoints at or above 0x200000 do not fit the four-byte form and
 * produce no output.
 *
 * @param out The stream to write to.
 * @param c   The codepoint to encode.
 */
void fput_utf8c(FILE *out, codepoint_t c)
{
	if (c < 0x80)
	{
		// Single byte: emitted unchanged.
		fputc((uint8_t)c, out);
		return;
	}

	// Select the lead-byte marker and the number of continuation bytes.
	uint8_t lead;
	int trailing;
	if (c < 0x800)
	{
		lead = 0xC0;
		trailing = 1;
	}
	else if (c < 0x10000)
	{
		lead = 0xE0;
		trailing = 2;
	}
	else if (c < 0x200000)
	{
		lead = 0xF0;
		trailing = 3;
	}
	else
		return;

	// The lead byte carries the top bits; each continuation byte carries six more.
	fputc(lead | (c >> (6 * trailing)), out);
	while (trailing-- > 0)
		fputc(0x80 | ((c >> (6 * trailing)) & 0x3F), out);
}
/** @brief Read one UTF-8 encoded codepoint from a stream.
 *
 * Well-formed one- to four-byte sequences are decoded to their codepoint.
 * A byte that cannot start a sequence (a stray continuation byte 0x80-0xBF
 * or 0xF8-0xFF), or a sequence interrupted by a non-continuation byte, is
 * reported as U+FFFD; the interrupting byte is pushed back so it is decoded
 * on the next call instead of being swallowed. Overlong forms, surrogates
 * and values above U+10FFFF are not rejected.
 *
 * @param in The stream to read from.
 * @param c  Receives the decoded codepoint. Only written when 1 is returned.
 * @return 1 if a codepoint was stored in @a c, 0 on end of file (including
 *         end of file in the middle of a sequence).
 */
int fget_utf8c(FILE *in, codepoint_t *c)
{
	int ch = fgetc(in);
	if (ch == EOF) return 0;

	codepoint_t value = 0;
	int trailing = 0;
	if (ch < 0x80)
		value = (codepoint_t)ch;
	else if ((ch & 0xE0) == 0xC0)
	{
		value = ch & 0x1F;
		trailing = 1;
	}
	else if ((ch & 0xF0) == 0xE0)
	{
		value = ch & 0x0F;
		trailing = 2;
	}
	else if ((ch & 0xF8) == 0xF0)
	{
		value = ch & 0x07;
		trailing = 3;
	}
	else
	{
		// Not a valid lead byte: substitute the replacement character.
		*c = 0xFFFD;
		return 1;
	}

	while (trailing-- > 0)
	{
		ch = fgetc(in);
		if (ch == EOF) return 0;
		if ((ch & 0xC0) != 0x80)
		{
			// The sequence was cut short; keep this byte for the next call.
			ungetc(ch, in);
			*c = 0xFFFD;
			return 1;
		}
		value = (value << 6) | (codepoint_t)(ch & 0x3F);
	}

	*c = value;
	return 1;
}
/** @brief Print a codepoint in the representation selected by @a mode.
 *
 * @param out  The stream to write to.
 * @param c    The codepoint to print.
 * @param mode 'c' for the UTF-8 character (tab, carriage return and newline
 *             are written as the escapes \\t, \\r and \\n), 'h' for six
 *             lower-case hex digits, 'H' for six upper-case hex digits.
 *             Any other value prints nothing.
 */
void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
{
	if (mode == 'h') // hexadecimal (lower)
	{
		fprintf(out, "%06x", c);
		return;
	}
	if (mode == 'H') // hexadecimal (upper)
	{
		fprintf(out, "%06X", c);
		return;
	}
	if (mode != 'c')
		return;

	// character
	if (c == '\t')
		fputs("\\t", out);
	else if (c == '\r')
		fputs("\\r", out);
	else if (c == '\n')
		fputs("\\n", out);
	else
		fput_utf8c(out, c);
}
/** @brief Print '1' or '0' according to a C library wide-character class test.
 *
 * @param out  The stream to write to.
 * @param c    The codepoint to classify.
 * @param mode Selects the isw* predicate to apply (see the case labels).
 *             Any other value prints nothing.
 */
void uprintf_is(FILE *out, codepoint_t c, char mode)
{
	int matched;
	switch (mode)
	{
	case 'A': matched = iswalnum(c);  break; // alpha-numeric
	case 'a': matched = iswalpha(c);  break; // alpha
	case 'b': matched = iswblank(c);  break; // blank
	case 'c': matched = iswcntrl(c);  break; // control
	case 'd': matched = iswdigit(c);  break; // numeric
	case 'g': matched = iswgraph(c);  break; // glyph
	case 'l': matched = iswlower(c);  break; // lower case
	case 'P': matched = iswprint(c);  break; // printable
	case 'p': matched = iswpunct(c);  break; // punctuation
	case 's': matched = iswspace(c);  break; // whitespace
	case 'u': matched = iswupper(c);  break; // upper case
	case 'x': matched = iswxdigit(c); break; // xdigit
	default:
		return;
	}
	fputc(matched ? '1' : '0', out);
}
/* Return the mode character that follows a two-part specifier such as "%pH".
 *
 * On entry *format points at the specifier letter. When a mode character is
 * present, *format is advanced onto it and it is returned. When the string
 * ends right after the specifier, *format is left unchanged and '\0' is
 * returned, so the caller never steps beyond the terminator.
 */
static char uprintf_mode(const char **format)
{
	if ((*format)[1] == '\0')
		return '\0';
	return *++*format;
}

/** @brief Print information about a codepoint according to a format string.
 *
 * Specifiers: %c category, %C category group, %s script, %p<m> the
 * codepoint, %L<m> / %U<m> the towlower / towupper mapping, %T<m> the
 * ucd_totitle mapping (<m> is a uprintf_codepoint mode), %i<m> a class test
 * (<m> is a uprintf_is mode). Unknown specifiers are skipped. A backslash
 * followed by t, r or n prints that control character; a backslash followed
 * by any other character prints that character. Everything else is copied
 * verbatim.
 *
 * A format that ends in the middle of a specifier or escape is tolerated:
 * the incomplete item is dropped and processing stops at the terminator.
 *
 * @param out    The stream to write to.
 * @param c      The codepoint to describe.
 * @param format The NUL-terminated format string.
 */
void uprintf(FILE *out, codepoint_t c, const char *format)
{
	while (*format) switch (*format)
	{
	case '%':
		switch (*++format)
		{
		case 0: // dangling '%': stop here rather than step past the terminator
			return;
		case 'c': // category
			fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
			break;
		case 'C': // category group
			fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
			break;
		case 'p': // codepoint
			uprintf_codepoint(out, c, uprintf_mode(&format));
			break;
		case 'i': // is*
			uprintf_is(out, c, uprintf_mode(&format));
			break;
		case 'L': // lowercase
			uprintf_codepoint(out, towlower(c), uprintf_mode(&format));
			break;
		case 's': // script
			fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
			break;
		case 'T': // titlecase
			uprintf_codepoint(out, ucd_totitle(c), uprintf_mode(&format));
			break;
		case 'U': // uppercase
			uprintf_codepoint(out, towupper(c), uprintf_mode(&format));
			break;
		}
		// format is on the last consumed (non-NUL) character; step over it.
		++format;
		break;
	case '\\':
		switch (*++format) {
		case 0: // dangling backslash: leave format on the terminator
			break;
		case 't':
			fputc('\t', out);
			++format;
			break;
		case 'r':
			fputc('\r', out);
			++format;
			break;
		case 'n':
			fputc('\n', out);
			++format;
			break;
		default:
			fputc(*format, out);
			++format;
			break;
		}
		break;
	default:
		fputc(*format, out);
		++format;
		break;
	}
}
void print_file(FILE *in, const char *format) | |||||
{ | |||||
codepoint_t c = 0; | |||||
while (fget_utf8c(in, &c)) | |||||
uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n"); | |||||
} | |||||
int main(int argc, char **argv) | |||||
{ | |||||
FILE *in = NULL; | |||||
const char *format = NULL; | |||||
for (int argn = 1; argn != argc; ++argn) | |||||
{ | |||||
const char *arg = argv[argn]; | |||||
if (!strcmp(arg, "--stdin") || !strcmp(arg, "-")) | |||||
in = stdin; | |||||
else if (!strncmp(arg, "--format=", 9)) | |||||
format = arg + 9; | |||||
else if (!strncmp(arg, "--locale=", 9)) | |||||
setlocale(LC_CTYPE, arg + 9); | |||||
else if (in == NULL) | |||||
{ | |||||
in = fopen(arg, "r"); | |||||
if (!in) | |||||
fprintf(stdout, "cannot open `%s`\n", argv[1]); | |||||
} | |||||
} | |||||
if (in == stdin) | |||||
print_file(stdin, format); | |||||
else if (in != NULL) | |||||
{ | |||||
print_file(in, format); | |||||
fclose(in); | |||||
} | |||||
else | |||||
{ | |||||
for (codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||||
uprintf(stdout, c, format ? format : | |||||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||||
} | |||||
return 0; | |||||
} |
/* | /* | ||||
* Copyright (C) 2012-2016 Reece H. Dunn | |||||
* Copyright (C) 2012-2017 Reece H. Dunn | |||||
* | * | ||||
* This file is part of ucd-tools. | * This file is part of ucd-tools. | ||||
* | * | ||||
} | } | ||||
} | } | ||||
/** @brief Print '1' or '0' according to a ucd-tools character class test.
 *
 * @param out  The stream to write to.
 * @param c    The codepoint to classify.
 * @param mode Selects the ucd_is* predicate to apply (see the case labels).
 *             Any other value prints nothing.
 */
void uprintf_is(FILE *out, codepoint_t c, char mode)
{
	int matched;
	switch (mode)
	{
	case 'A': matched = ucd_isalnum(c);  break; // alpha-numeric
	case 'a': matched = ucd_isalpha(c);  break; // alpha
	case 'b': matched = ucd_isblank(c);  break; // blank
	case 'c': matched = ucd_iscntrl(c);  break; // control
	case 'd': matched = ucd_isdigit(c);  break; // numeric
	case 'g': matched = ucd_isgraph(c);  break; // glyph
	case 'l': matched = ucd_islower(c);  break; // lower case
	case 'P': matched = ucd_isprint(c);  break; // printable
	case 'p': matched = ucd_ispunct(c);  break; // punctuation
	case 's': matched = ucd_isspace(c);  break; // whitespace
	case 'u': matched = ucd_isupper(c);  break; // upper case
	case 'x': matched = ucd_isxdigit(c); break; // xdigit
	default:
		return;
	}
	fputc(matched ? '1' : '0', out);
}
void uprintf(FILE *out, codepoint_t c, const char *format) | void uprintf(FILE *out, codepoint_t c, const char *format) | ||||
{ | { | ||||
while (*format) switch (*format) | while (*format) switch (*format) | ||||
case 'p': // codepoint | case 'p': // codepoint | ||||
uprintf_codepoint(out, c, *++format); | uprintf_codepoint(out, c, *++format); | ||||
break; | break; | ||||
case 'i': // is* | |||||
uprintf_is(out, c, *++format); | |||||
break; | |||||
case 'L': // lowercase | case 'L': // lowercase | ||||
uprintf_codepoint(out, ucd_tolower(c), *++format); | uprintf_codepoint(out, ucd_tolower(c), *++format); | ||||
break; | break; | ||||
case 'U': // uppercase | case 'U': // uppercase | ||||
uprintf_codepoint(out, ucd_toupper(c), *++format); | uprintf_codepoint(out, ucd_toupper(c), *++format); | ||||
break; | break; | ||||
case 'W': // whitespace | |||||
if (ucd_isspace(c)) | |||||
fputs("White_Space", out); | |||||
break; | |||||
} | } | ||||
++format; | ++format; | ||||
break; | break; | ||||
case '\\': | |||||
switch (*++format) { | |||||
case 0: | |||||
break; | |||||
case 't': | |||||
fputc('\t', out); | |||||
++format; | |||||
break; | |||||
case 'r': | |||||
fputc('\r', out); | |||||
++format; | |||||
break; | |||||
case 'n': | |||||
fputc('\n', out); | |||||
++format; | |||||
break; | |||||
default: | |||||
fputc(*format, out); | |||||
++format; | |||||
break; | |||||
} | |||||
break; | |||||
default: | default: | ||||
fputc(*format, out); | fputc(*format, out); | ||||
++format; | ++format; | ||||
} | } | ||||
} | } | ||||
void print_file(FILE *in) | |||||
void print_file(FILE *in, const char *format) | |||||
{ | { | ||||
codepoint_t c = 0; | codepoint_t c = 0; | ||||
while (fget_utf8c(in, &c)) | while (fget_utf8c(in, &c)) | ||||
uprintf(stdout, c, "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n"); | |||||
uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n"); | |||||
} | } | ||||
int main(int argc, char **argv) | int main(int argc, char **argv) | ||||
{ | { | ||||
if (argc == 2) | |||||
FILE *in = NULL; | |||||
const char *format = NULL; | |||||
for (int argn = 1; argn != argc; ++argn) | |||||
{ | { | ||||
if (!strcmp(argv[1], "--stdin") || !strcmp(argv[1], "-")) | |||||
print_file(stdin); | |||||
else | |||||
const char *arg = argv[argn]; | |||||
if (!strcmp(arg, "--stdin") || !strcmp(arg, "-")) | |||||
in = stdin; | |||||
else if (!strncmp(arg, "--format=", 9)) | |||||
format = arg + 9; | |||||
else if (in == NULL) | |||||
{ | { | ||||
FILE *in = fopen(argv[1], "r"); | |||||
if (in) | |||||
{ | |||||
print_file(in); | |||||
fclose(in); | |||||
} | |||||
else | |||||
in = fopen(arg, "r"); | |||||
if (!in) | |||||
fprintf(stdout, "cannot open `%s`\n", argv[1]); | fprintf(stdout, "cannot open `%s`\n", argv[1]); | ||||
} | } | ||||
} | } | ||||
if (in == stdin) | |||||
print_file(stdin, format); | |||||
else if (in != NULL) | |||||
{ | |||||
print_file(in, format); | |||||
fclose(in); | |||||
} | |||||
else | else | ||||
{ | { | ||||
for (codepoint_t c = 0; c <= 0x10FFFF; ++c) | for (codepoint_t c = 0; c <= 0x10FFFF; ++c) | ||||
uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n"); | |||||
uprintf(stdout, c, format ? format : | |||||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||||
} | } | ||||
return 0; | return 0; | ||||
} | } |
/* | /* | ||||
* Copyright (C) 2012-2016 Reece H. Dunn | |||||
* Copyright (C) 2012-2017 Reece H. Dunn | |||||
* | * | ||||
* This file is part of ucd-tools. | * This file is part of ucd-tools. | ||||
* | * | ||||
} | } | ||||
} | } | ||||
void uprintf_is(FILE *out, ucd::codepoint_t c, char mode) | |||||
{ | |||||
switch (mode) | |||||
{ | |||||
case 'A': // alpha-numeric | |||||
fputc(ucd::isalnum(c) ? '1' : '0', out); | |||||
break; | |||||
case 'a': // alpha | |||||
fputc(ucd::isalpha(c) ? '1' : '0', out); | |||||
break; | |||||
case 'b': // blank | |||||
fputc(ucd::isblank(c) ? '1' : '0', out); | |||||
break; | |||||
case 'c': // control | |||||
fputc(ucd::iscntrl(c) ? '1' : '0', out); | |||||
break; | |||||
case 'd': // numeric | |||||
fputc(ucd::isdigit(c) ? '1' : '0', out); | |||||
break; | |||||
case 'g': // glyph | |||||
fputc(ucd::isgraph(c) ? '1' : '0', out); | |||||
break; | |||||
case 'l': // lower case | |||||
fputc(ucd::islower(c) ? '1' : '0', out); | |||||
break; | |||||
case 'P': // printable | |||||
fputc(ucd::isprint(c) ? '1' : '0', out); | |||||
break; | |||||
case 'p': // punctuation | |||||
fputc(ucd::ispunct(c) ? '1' : '0', out); | |||||
break; | |||||
case 's': // whitespace | |||||
fputc(ucd::isspace(c) ? '1' : '0', out); | |||||
break; | |||||
case 'u': // upper case | |||||
fputc(ucd::isupper(c) ? '1' : '0', out); | |||||
break; | |||||
case 'x': // xdigit | |||||
fputc(ucd::isxdigit(c) ? '1' : '0', out); | |||||
break; | |||||
} | |||||
} | |||||
void uprintf(FILE *out, ucd::codepoint_t c, const char *format) | void uprintf(FILE *out, ucd::codepoint_t c, const char *format) | ||||
{ | { | ||||
while (*format) switch (*format) | while (*format) switch (*format) | ||||
case 'p': // codepoint | case 'p': // codepoint | ||||
uprintf_codepoint(out, c, *++format); | uprintf_codepoint(out, c, *++format); | ||||
break; | break; | ||||
case 'i': // is* | |||||
uprintf_is(out, c, *++format); | |||||
break; | |||||
case 'L': // lowercase | case 'L': // lowercase | ||||
uprintf_codepoint(out, ucd::tolower(c), *++format); | uprintf_codepoint(out, ucd::tolower(c), *++format); | ||||
break; | break; | ||||
case 'U': // uppercase | case 'U': // uppercase | ||||
uprintf_codepoint(out, ucd::toupper(c), *++format); | uprintf_codepoint(out, ucd::toupper(c), *++format); | ||||
break; | break; | ||||
case 'W': // whitespace | |||||
if (ucd::isspace(c)) | |||||
fputs("White_Space", out); | |||||
break; | |||||
} | } | ||||
++format; | ++format; | ||||
break; | break; | ||||
case '\\': | |||||
switch (*++format) { | |||||
case 0: | |||||
break; | |||||
case 't': | |||||
fputc('\t', out); | |||||
++format; | |||||
break; | |||||
case 'r': | |||||
fputc('\r', out); | |||||
++format; | |||||
break; | |||||
case 'n': | |||||
fputc('\n', out); | |||||
++format; | |||||
break; | |||||
default: | |||||
fputc(*format, out); | |||||
++format; | |||||
break; | |||||
} | |||||
break; | |||||
default: | default: | ||||
fputc(*format, out); | fputc(*format, out); | ||||
++format; | ++format; | ||||
} | } | ||||
} | } | ||||
void print_file(FILE *in) | |||||
void print_file(FILE *in, const char *format) | |||||
{ | { | ||||
ucd::codepoint_t c = 0; | ucd::codepoint_t c = 0; | ||||
while (fget_utf8c(in, c)) | while (fget_utf8c(in, c)) | ||||
uprintf(stdout, c, "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n"); | |||||
uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n"); | |||||
} | } | ||||
int main(int argc, char **argv) | int main(int argc, char **argv) | ||||
{ | { | ||||
if (argc == 2) | |||||
FILE *in = NULL; | |||||
const char *format = NULL; | |||||
for (int argn = 1; argn != argc; ++argn) | |||||
{ | { | ||||
if (!strcmp(argv[1], "--stdin") || !strcmp(argv[1], "-")) | |||||
print_file(stdin); | |||||
else | |||||
const char *arg = argv[argn]; | |||||
if (!strcmp(arg, "--stdin") || !strcmp(arg, "-")) | |||||
in = stdin; | |||||
else if (!strncmp(arg, "--format=", 9)) | |||||
format = arg + 9; | |||||
else if (in == NULL) | |||||
{ | { | ||||
FILE *in = fopen(argv[1], "r"); | |||||
if (in) | |||||
{ | |||||
print_file(in); | |||||
fclose(in); | |||||
} | |||||
else | |||||
in = fopen(arg, "r"); | |||||
if (!in) | |||||
fprintf(stdout, "cannot open `%s`\n", argv[1]); | fprintf(stdout, "cannot open `%s`\n", argv[1]); | ||||
} | } | ||||
} | } | ||||
if (in == stdin) | |||||
print_file(stdin, format); | |||||
else if (in != NULL) | |||||
{ | |||||
print_file(in, format); | |||||
fclose(in); | |||||
} | |||||
else | else | ||||
{ | { | ||||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | ||||
uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n"); | |||||
uprintf(stdout, c, format ? format : | |||||
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n"); | |||||
} | } | ||||
return 0; | return 0; | ||||
} | } |
#!/usr/bin/python | #!/usr/bin/python | ||||
# Copyright (C) 2012 Reece H. Dunn | |||||
# Copyright (C) 2012-2017 Reece H. Dunn | |||||
# | # | ||||
# This file is part of ucd-tools. | # This file is part of ucd-tools. | ||||
# | # | ||||
ucd_rootdir = sys.argv[1] | ucd_rootdir = sys.argv[1] | ||||
csur_rootdir = 'data/csur' | csur_rootdir = 'data/csur' | ||||
null = ucd.CodePoint('0000') | |||||
unicode_chars = {} | unicode_chars = {} | ||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
for codepoint in data['CodePoint']: | for codepoint in data['CodePoint']: | ||||
unicode_chars[codepoint] = data | unicode_chars[codepoint] = data | ||||
unicode_chars[codepoint]['Properties'] = [] | |||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'): | |||||
if data['Property'] in ['White_Space']: | |||||
for propfile in ['PropList', 'DerivedCoreProperties']: | |||||
for data in ucd.parse_ucd_data(ucd_rootdir, propfile): | |||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint]['Properties'].append(data['Property']) | |||||
try: | |||||
unicode_chars[codepoint][data['Property']] = 1 | |||||
except KeyError: | |||||
unicode_chars[codepoint] = {'CodePoint': codepoint} | |||||
unicode_chars[codepoint][data['Property']] = 1 | |||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint]['Script'] = data['Script'] | unicode_chars[codepoint]['Script'] = data['Script'] | ||||
for csur in ['Klingon']: | for csur in ['Klingon']: | ||||
for data in ucd.parse_ucd_data('data/csur', csur): | for data in ucd.parse_ucd_data('data/csur', csur): | ||||
for codepoint in data['CodePoint']: | for codepoint in data['CodePoint']: | ||||
if not 'TitleCase' in data: data['TitleCase'] = codepoint | |||||
if not 'UpperCase' in data: data['UpperCase'] = codepoint | |||||
if not 'LowerCase' in data: data['LowerCase'] = codepoint | |||||
if not 'Properties' in data: data['Properties'] = [] | |||||
unicode_chars[codepoint] = data | unicode_chars[codepoint] = data | ||||
null = ucd.CodePoint('0000') | |||||
def iscntrl(data):
    """Return 1 if the record's 'Name' field is '<control>', otherwise 0."""
    name = data.get('Name', '')
    return int(name == '<control>')
def isdigit(data):
    """Return 1 if the record's codepoint is one of the ASCII digits 0-9, otherwise 0."""
    character = data['CodePoint'].char()
    if character in '0123456789':
        return 1
    return 0
def isxdigit(data):
    """Return 1 if the record's codepoint is an ASCII hexadecimal digit, otherwise 0."""
    character = data['CodePoint'].char()
    if character in '0123456789ABCDEFabcdef':
        return 1
    return 0
def isspace(data):
    """Return 1 for White_Space characters, except those with a <noBreak> decomposition."""
    if not data.get('White_Space', 0):
        return 0
    decomposition = data.get('DecompositionType', '')
    if decomposition is None:
        return 1
    # No-break spaces carry the White_Space property but are excluded here.
    return 0 if decomposition.startswith('<noBreak>') else 1
def isblank(data): # word separator
    """Return 1 for Zs characters and tab, except those with a <noBreak> decomposition."""
    is_separator = data.get('GeneralCategory', 'Cn') == 'Zs' or data['CodePoint'].char() == '\t'
    if not is_separator:
        return 0
    decomposition = data.get('DecompositionType', '')
    if decomposition is None:
        return 1
    # No-break spaces are in Zs but do not count as blank here.
    return 0 if decomposition.startswith('<noBreak>') else 1
def ispunct(data):
    """Return 1 if the general category is in the P (punctuation) group, otherwise 0."""
    group = data.get('GeneralCategory', 'Cn')[0]
    return 1 if group == 'P' else 0
def isprint(data):
    """Return 1 if the general category group is one of L, M, N, P, S or Z, otherwise 0."""
    group = data.get('GeneralCategory', 'Cn')[0]
    # Everything except the C (other) and I groups counts as printable.
    return 1 if group in ('L', 'M', 'N', 'P', 'S', 'Z') else 0
def isgraph(data):
    """Return 1 if the general category group is one of L, M, N, P or S, otherwise 0."""
    group = data.get('GeneralCategory', 'Cn')[0]
    # Like isprint, but the Z (separator) group is excluded as well.
    return 1 if group in ('L', 'M', 'N', 'P', 'S') else 0
def isalnum(data):
    """Return 1 for the N (number) category group, otherwise the 'Alphabetic' value (default 0)."""
    group = data.get('GeneralCategory', 'Cn')[0]
    if group == 'N':
        return 1
    return data.get('Alphabetic', 0)
def isalpha(data):
    """Return the record's 'Alphabetic' value, or 0 when it is not set."""
    if 'Alphabetic' in data:
        return data['Alphabetic']
    return 0
def isupper(data):
    """Return 1 if the record has the Uppercase property or a lowercase mapping, otherwise 0."""
    if data.get('Uppercase', 0):
        return 1
    # Some Lt characters have lowercase forms.
    has_lowercase_form = data.get('LowerCase', null) != null
    return 1 if has_lowercase_form else 0
def islower(data):
    """Return 1 if the record has the Lowercase property or an uppercase mapping, otherwise 0."""
    if data.get('Lowercase', 0):
        return 1
    has_uppercase_form = data.get('UpperCase', null) != null
    return 1 if has_uppercase_form else 0
if __name__ == '__main__': | if __name__ == '__main__': | ||||
for codepoint in ucd.CodeRange('000000..10FFFF'): | for codepoint in ucd.CodeRange('000000..10FFFF'): | ||||
try: | try: | ||||
data = unicode_chars[codepoint] | data = unicode_chars[codepoint] | ||||
except KeyError: | except KeyError: | ||||
data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []} | |||||
try: | |||||
script = data['Script'] | |||||
except KeyError: | |||||
script = 'Zzzz' | |||||
title = data['TitleCase'] | |||||
upper = data['UpperCase'] | |||||
lower = data['LowerCase'] | |||||
data = {'CodePoint': codepoint} | |||||
script = data.get('Script', 'Zzzz') | |||||
title = data.get('TitleCase', codepoint) | |||||
upper = data.get('UpperCase', codepoint) | |||||
lower = data.get('LowerCase', codepoint) | |||||
if title == null: title = codepoint | if title == null: title = codepoint | ||||
if upper == null: upper = codepoint | if upper == null: upper = codepoint | ||||
if lower == null: lower = codepoint | if lower == null: lower = codepoint | ||||
print('%s %s %s %s %s %s %s %s' % ( | |||||
print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % ( | |||||
codepoint, script, | codepoint, script, | ||||
data['GeneralCategory'][0], data['GeneralCategory'], | |||||
data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'), | |||||
upper, lower, title, | upper, lower, title, | ||||
' '.join(data['Properties']))) | |||||
isdigit(data), isxdigit(data), | |||||
iscntrl(data), isspace(data), isblank(data), ispunct(data), | |||||
isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data))) |
#!/usr/bin/python | #!/usr/bin/python | ||||
# Copyright (C) 2012-2014 Reece H. Dunn | |||||
# Copyright (C) 2012-2017 Reece H. Dunn | |||||
# | # | ||||
# This file is part of ucd-tools. | # This file is part of ucd-tools. | ||||
# | # | ||||
def __lt__(self, other): | def __lt__(self, other): | ||||
return self.codepoint < other.codepoint | return self.codepoint < other.codepoint | ||||
def char(self):
    """Return this codepoint as a one-character unicode string."""
    try:
        return unichr(self.codepoint)
    except NameError:
        # Python 3 has no unichr; chr covers the full Unicode range there.
        return chr(self.codepoint)
class CodeRange: | class CodeRange: | ||||
def __init__(self, x): | def __init__(self, x): | ||||
f, l = x.split('..') | f, l = x.split('..') | ||||
def size(self): | def size(self): | ||||
return self.last.codepoint - self.first.codepoint + 1 | return self.last.codepoint - self.first.codepoint + 1 | ||||
def char(self):
    """Return the first codepoint of the range as a one-character unicode string."""
    try:
        return unichr(self.first.codepoint)
    except NameError:
        # Python 3 has no unichr; chr covers the full Unicode range there.
        return chr(self.first.codepoint)
def codepoint(x): | def codepoint(x): | ||||
if '..' in x[0]: | if '..' in x[0]: | ||||
return CodeRange(x[0]), x[1:] | return CodeRange(x[0]), x[1:] | ||||
('Range', codepoint), | ('Range', codepoint), | ||||
('Age', string), | ('Age', string), | ||||
], | ], | ||||
'DerivedCoreProperties': [ | |||||
('Range', codepoint), | |||||
('Property', string), | |||||
], | |||||
'PropList': [ | 'PropList': [ | ||||
('Range', codepoint), | ('Range', codepoint), | ||||
('Property', string), | ('Property', string), |