8 years ago · d2a919bc7b
--- a/src/ucd-tools/.gitignore
+++ b/src/ucd-tools/.gitignore
@@ -15,6 +15,7 @@
 data/ucd

 src/libucd.la
 tests/printcdata
 tests/printucddata
 tests/printucddata_cpp

--- a/src/ucd-tools/CHANGELOG.md
+++ b/src/ucd-tools/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Change Log

 ## 9.0.0.1 - (In Progress)

 *  Add `iswblank` and `iswxdigit` compatibility.
 *  Improve ctype compatibility.

 ## 9.0.0 - 2016-12-28

 *  Update to Unicode Character Data 9.0.0.
--- a/src/ucd-tools/Makefile.am
+++ b/src/ucd-tools/Makefile.am
@@ -57,22 +57,27 @@ EXTRA_DIST += ChangeLog

 UCD_VERSION=@UCD_VERSION@
 UCD_ROOTDIR=data/ucd
 UCD_SRCDIR=http://www.unicode.org/Public

 # Download the Unicode Character Data files that the generated tables and the
 # test expectations are built from.
 #
 # Each file is fetched exactly once with curl into a temporary name and only
 # moved onto the target when curl succeeds, so a failed or interrupted
 # download never leaves a truncated file that make would consider up to date.
 #   -f  fail on HTTP errors instead of saving the server's error page
 #   -L  follow redirects
 #   -o  write to the named file instead of relying on shell redirection

 data/ucd/PropList.txt:
 	mkdir -pv $(@D)
 	curl -fL -o $@.tmp ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropList.txt && mv -f $@.tmp $@

 data/ucd/DerivedCoreProperties.txt:
 	mkdir -pv $(@D)
 	curl -fL -o $@.tmp ${UCD_SRCDIR}/${UCD_VERSION}/ucd/DerivedCoreProperties.txt && mv -f $@.tmp $@

 data/ucd/PropertyValueAliases.txt:
 	mkdir -pv $(@D)
 	curl -fL -o $@.tmp ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropertyValueAliases.txt && mv -f $@.tmp $@

 data/ucd/Scripts.txt:
 	mkdir -pv $(@D)
 	curl -fL -o $@.tmp ${UCD_SRCDIR}/${UCD_VERSION}/ucd/Scripts.txt && mv -f $@.tmp $@

 data/ucd/UnicodeData.txt:
 	mkdir -pv $(@D)
 	curl -fL -o $@.tmp ${UCD_SRCDIR}/${UCD_VERSION}/ucd/UnicodeData.txt && mv -f $@.tmp $@

 ############################# documentation ###################################

@@ -122,6 +127,10 @@ src_libucd_la_SOURCES  = \

 ############################# tests ###########################################

 noinst_bin_PROGRAMS     += tests/printcdata
 tests_printcdata_SOURCES = tests/printcdata.c
 tests_printcdata_LDADD   = src/libucd.la

 noinst_bin_PROGRAMS       += tests/printucddata
 tests_printucddata_SOURCES = tests/printucddata.c
 tests_printucddata_LDADD   = src/libucd.la
@@ -133,6 +142,7 @@ tests_printucddata_cpp_LDADD   = src/libucd.la
 tests/unicode-data.expected: tools/printdata.py tools/ucd.py \
 	data/ucd/UnicodeData.txt \
 	data/ucd/PropList.txt \
 	data/ucd/DerivedCoreProperties.txt \
 	data/ucd/Scripts.txt
 	tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@

--- a/src/ucd-tools/README.md
+++ b/src/ucd-tools/README.md
@@ -92,20 +92,20 @@ C library provides a set of APIs that are compatible with `wctype.h`.

 The following character classification functions are provided:

 | C API         | C++ API        |
 |---------------|----------------|
 | `ucd_isalnum` | `ucd::isalnum` |
 | `ucd_isalpha` | `ucd::isalpha` |
 | `ucd_iscntrl` | `ucd::iscntrl` |
 | `ucd_isdigit` | `ucd::isdigit` |
 | `ucd_isgraph` | `ucd::isgraph` |
 | `ucd_islower` | `ucd::islower` |
 | `ucd_isprint` | `ucd::isprint` |
 | `ucd_ispunct` | `ucd::ispunct` |
 | `ucd_isspace` | `ucd::isspace` |
 | `ucd_isupper` | `ucd::isupper` |

 __NOTE:__ Equivalents for `isblank` and `isxdigit` are not provided.
 | C API          | C++ API         |
 |----------------|-----------------|
 | `ucd_isalnum`  | `ucd::isalnum`  |
 | `ucd_isalpha`  | `ucd::isalpha`  |
 | `ucd_isblank`  | `ucd::isblank`  |
 | `ucd_iscntrl`  | `ucd::iscntrl`  |
 | `ucd_isdigit`  | `ucd::isdigit`  |
 | `ucd_isgraph`  | `ucd::isgraph`  |
 | `ucd_islower`  | `ucd::islower`  |
 | `ucd_isprint`  | `ucd::isprint`  |
 | `ucd_ispunct`  | `ucd::ispunct`  |
 | `ucd_isspace`  | `ucd::isspace`  |
 | `ucd_isupper`  | `ucd::isupper`  |
 | `ucd_isxdigit` | `ucd::isxdigit` |

 ## Build Dependencies

--- a/src/ucd-tools/src/ctype.c
+++ b/src/ucd-tools/src/ctype.c
@@ -1,6 +1,6 @@
 /* ctype-style APIs.
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 * Copyright (C) 2012-2017 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
@@ -20,19 +20,276 @@

 #include "ucd/ucd.h"

 /* Hard-coded codepoint ranges consulted by ucd_isalnum and ucd_isalpha for
  * codepoints whose general category is Mn, Mc or So.
  *
  * The codepoint is first reduced to its 256-codepoint block (c & 0xFFFFFF00)
  * so that only the ranges belonging to that block are tested.
  *
  * NOTE(review): the name suggests these ranges mirror the Other_Alphabetic
  * property from PropList.txt for the bundled UCD version -- confirm when
  * updating the Unicode data.
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is in one of the listed ranges, zero otherwise.
  */
 static int other_alphabetic_MnMcSo(codepoint_t c)
 {
 	switch (c & 0xFFFFFF00)
 	{
 	case 0x0300:
 		return c == 0x0345;
 	case 0x0500:
 		return (c >= 0x05B0 && c <= 0x05BD)
 		    ||  c == 0x05BF
 		    || (c >= 0x05C1 && c <= 0x05C2)
 		    || (c >= 0x05C4 && c <= 0x05C5)
 		    ||  c == 0x05C7;
 	case 0x0600:
 		return (c >= 0x0610 && c <= 0x061A)
 		    || (c >= 0x064B && c <= 0x0657)
 		    || (c >= 0x0659 && c <= 0x065F)
 		    ||  c == 0x0670
 		    || (c >= 0x06D6 && c <= 0x06DC)
 		    || (c >= 0x06E1 && c <= 0x06E4)
 		    || (c >= 0x06E7 && c <= 0x06E8)
 		    ||  c == 0x06ED;
 	case 0x0700:
 		return  c == 0x0711
 		    || (c >= 0x0730 && c <= 0x073F)
 		    || (c >= 0x07A6 && c <= 0x07B0);
 	case 0x0800:
 		return (c >= 0x0816 && c <= 0x0817)
 		    || (c >= 0x081B && c <= 0x0823)
 		    || (c >= 0x0825 && c <= 0x0827)
 		    || (c >= 0x0829 && c <= 0x082C)
 		    || (c >= 0x08D4 && c <= 0x08DF)
 		    || (c >= 0x08E3 && c <= 0x08E9)
 		    ||  c >= 0x08F0; // the switch already limits c to 0x0800..0x08FF
 	case 0x0900:
 		return (c >= 0x0900 && c <= 0x0903)
 		    || (c >= 0x093A && c <= 0x093B)
 		    || (c >= 0x093E && c <= 0x094C)
 		    || (c >= 0x094E && c <= 0x094F)
 		    || (c >= 0x0955 && c <= 0x0957)
 		    || (c >= 0x0962 && c <= 0x0963)
 		    || (c >= 0x0981 && c <= 0x0983)
 		    || (c >= 0x09BE && c <= 0x09C4)
 		    || (c >= 0x09C7 && c <= 0x09C8)
 		    || (c >= 0x09CB && c <= 0x09CC)
 		    ||  c == 0x09D7
 		    || (c >= 0x09E2 && c <= 0x09E3);
 	case 0x0A00:
 		return (c >= 0x0A01 && c <= 0x0A03)
 		    || (c >= 0x0A3E && c <= 0x0A42)
 		    || (c >= 0x0A47 && c <= 0x0A48)
 		    || (c >= 0x0A4B && c <= 0x0A4C)
 		    ||  c == 0x0A51
 		    || (c >= 0x0A70 && c <= 0x0A71)
 		    ||  c == 0x0A75
 		    || (c >= 0x0A81 && c <= 0x0A83)
 		    || (c >= 0x0ABE && c <= 0x0AC5)
 		    || (c >= 0x0AC7 && c <= 0x0AC9)
 		    || (c >= 0x0ACB && c <= 0x0ACC)
 		    || (c >= 0x0AE2 && c <= 0x0AE3);
 	case 0x0B00:
 		return (c >= 0x0B01 && c <= 0x0B03)
 		    || (c >= 0x0B3E && c <= 0x0B44)
 		    || (c >= 0x0B47 && c <= 0x0B48)
 		    || (c >= 0x0B4B && c <= 0x0B4C)
 		    || (c >= 0x0B56 && c <= 0x0B57)
 		    || (c >= 0x0B62 && c <= 0x0B63)
 		    ||  c == 0x0B82
 		    || (c >= 0x0BBE && c <= 0x0BC2)
 		    || (c >= 0x0BC6 && c <= 0x0BC8)
 		    || (c >= 0x0BCA && c <= 0x0BCC)
 		    ||  c == 0x0BD7;
 	case 0x0C00:
 		return (c >= 0x0C00 && c <= 0x0C03)
 		    || (c >= 0x0C3E && c <= 0x0C44)
 		    || (c >= 0x0C46 && c <= 0x0C48)
 		    || (c >= 0x0C4A && c <= 0x0C4C)
 		    || (c >= 0x0C55 && c <= 0x0C56)
 		    || (c >= 0x0C62 && c <= 0x0C63)
 		    || (c >= 0x0C81 && c <= 0x0C83)
 		    || (c >= 0x0CBE && c <= 0x0CBF)
 		    || (c >= 0x0CC0 && c <= 0x0CC4)
 		    || (c >= 0x0CC6 && c <= 0x0CC8)
 		    || (c >= 0x0CCA && c <= 0x0CCC)
 		    || (c >= 0x0CD5 && c <= 0x0CD6)
 		    || (c >= 0x0CE2 && c <= 0x0CE3);
 	case 0x0D00:
 		return (c >= 0x0D01 && c <= 0x0D03)
 		    || (c >= 0x0D3E && c <= 0x0D44)
 		    || (c >= 0x0D46 && c <= 0x0D48)
 		    || (c >= 0x0D4A && c <= 0x0D4C)
 		    ||  c == 0x0D57
 		    || (c >= 0x0D62 && c <= 0x0D63)
 		    || (c >= 0x0D82 && c <= 0x0D83)
 		    || (c >= 0x0DCF && c <= 0x0DD4)
 		    ||  c == 0x0DD6
 		    || (c >= 0x0DD8 && c <= 0x0DDF)
 		    || (c >= 0x0DF2 && c <= 0x0DF3);
 	case 0x0E00:
 		return  c == 0x0E31
 		    || (c >= 0x0E34 && c <= 0x0E3A)
 		    ||  c == 0x0E4D
 		    ||  c == 0x0EB1
 		    || (c >= 0x0EB4 && c <= 0x0EB9)
 		    || (c >= 0x0EBB && c <= 0x0EBD)
 		    ||  c == 0x0ECD;
 	case 0x0F00:
 		return (c >= 0x0F71 && c <= 0x0F7F)
 		    || (c >= 0x0F80 && c <= 0x0F81)
 		    || (c >= 0x0F8D && c <= 0x0F97)
 		    || (c >= 0x0F99 && c <= 0x0FBC);
 	case 0x1000:
 		return (c >= 0x102B && c <= 0x1036)
 		    ||  c == 0x1038
 		    || (c >= 0x103B && c <= 0x103E)
 		    || (c >= 0x1056 && c <= 0x1059)
 		    || (c >= 0x105E && c <= 0x1060)
 		    ||  c == 0x1062
 		    || (c >= 0x1067 && c <= 0x1068)
 		    || (c >= 0x1071 && c <= 0x1074)
 		    || (c >= 0x1082 && c <= 0x1086)
 		    || (c >= 0x109C && c <= 0x109D);
 	case 0x1300:
 		return c == 0x135F;
 	case 0x1700:
 		return (c >= 0x1712 && c <= 0x1713)
 		    || (c >= 0x1732 && c <= 0x1733)
 		    || (c >= 0x1752 && c <= 0x1753)
 		    || (c >= 0x1772 && c <= 0x1773)
 		    || (c >= 0x17B6 && c <= 0x17C8);
 	case 0x1800:
 		return (c >= 0x1885 && c <= 0x1886)
 		    ||  c == 0x18A9;
 	case 0x1900:
 		return (c >= 0x1920 && c <= 0x192B)
 		    || (c >= 0x1930 && c <= 0x1938);
 	case 0x1A00:
 		return (c >= 0x1A17 && c <= 0x1A1B)
 		    || (c >= 0x1A55 && c <= 0x1A5E)
 		    || (c >= 0x1A61 && c <= 0x1A74);
 	case 0x1B00:
 		return (c >= 0x1B00 && c <= 0x1B04)
 		    || (c >= 0x1B35 && c <= 0x1B43)
 		    || (c >= 0x1B80 && c <= 0x1B82)
 		    || (c >= 0x1BA1 && c <= 0x1BA9)
 		    || (c >= 0x1BAC && c <= 0x1BAD)
 		    || (c >= 0x1BE7 && c <= 0x1BF1);
 	case 0x1C00:
 		return (c >= 0x1C24 && c <= 0x1C35)
 		    || (c >= 0x1CF2 && c <= 0x1CF3);
 	case 0x1D00:
 		return (c >= 0x1DE7 && c <= 0x1DF4);
 	case 0x2400:
 		return (c >= 0x24B6 && c <= 0x24E9);
 	case 0x2D00:
 		return (c >= 0x2DE0 && c <= 0x2DFF);
 	case 0xA600:
 		return (c >= 0xA674 && c <= 0xA67B)
 		    || (c >= 0xA69E && c <= 0xA69F);
 	case 0xA800:
 		return (c >= 0xA823 && c <= 0xA827)
 		    || (c >= 0xA880 && c <= 0xA881)
 		    || (c >= 0xA8B4 && c <= 0xA8C3)
 		    ||  c == 0xA8C5;
 	case 0xA900:
 		return (c >= 0xA926 && c <= 0xA92A)
 		    || (c >= 0xA947 && c <= 0xA952)
 		    || (c >= 0xA980 && c <= 0xA983)
 		    || (c >= 0xA9B4 && c <= 0xA9BF);
 	case 0xAA00:
 		return (c >= 0xAA29 && c <= 0xAA36)
 		    ||  c == 0xAA43
 		    || (c >= 0xAA4C && c <= 0xAA4D)
 		    ||  c == 0xAAB0
 		    || (c >= 0xAAB2 && c <= 0xAAB4)
 		    || (c >= 0xAAB7 && c <= 0xAAB8)
 		    ||  c == 0xAABE
 		    || (c >= 0xAAEB && c <= 0xAAEF)
 		    ||  c == 0xAAF5;
 	case 0xAB00:
 		return (c >= 0xABE3 && c <= 0xABEA);
 	case 0xFB00:
 		return c == 0xFB1E;
 	case 0x10300:
 		return (c >= 0x10376 && c <= 0x1037A);
 	case 0x10A00:
 		return (c >= 0x10A01 && c <= 0x10A03)
 		    || (c >= 0x10A05 && c <= 0x10A06)
 		    || (c >= 0x10A0C && c <= 0x10A0F);
 	case 0x11000:
 		return (c >= 0x11000 && c <= 0x11002)
 		    || (c >= 0x11038 && c <= 0x11045)
 		    ||  c == 0x11082
 		    || (c >= 0x110B0 && c <= 0x110B8);
 	case 0x11100:
 		return (c >= 0x11100 && c <= 0x11102)
 		    || (c >= 0x11127 && c <= 0x11132)
 		    || (c >= 0x11180 && c <= 0x11182)
 		    || (c >= 0x111B3 && c <= 0x111BF);
 	case 0x11200:
 		return (c >= 0x1122C && c <= 0x11234)
 		    ||  c == 0x11237
 		    ||  c == 0x1123E
 		    || (c >= 0x112DF && c <= 0x112E8);
 	case 0x11300:
 		return (c >= 0x11300 && c <= 0x11303)
 		    || (c >= 0x1133E && c <= 0x11344)
 		    || (c >= 0x11347 && c <= 0x11348)
 		    || (c >= 0x1134B && c <= 0x1134C)
 		    ||  c == 0x11357
 		    || (c >= 0x11362 && c <= 0x11363);
 	case 0x11400:
 		return (c >= 0x11435 && c <= 0x11441)
 		    || (c >= 0x11443 && c <= 0x11445)
 		    || (c >= 0x114B0 && c <= 0x114C1);
 	case 0x11500:
 		return (c >= 0x115AF && c <= 0x115B5)
 		    || (c >= 0x115B8 && c <= 0x115BE)
 		    || (c >= 0x115DC && c <= 0x115DD);
 	case 0x11600:
 		return (c >= 0x11630 && c <= 0x1163E)
 		    ||  c == 0x11640
 		    || (c >= 0x116AB && c <= 0x116B5);
 	case 0x11700:
 		return (c >= 0x1171D && c <= 0x1172A);
 	case 0x11C00:
 		return (c >= 0x11C2F && c <= 0x11C36)
 		    || (c >= 0x11C38 && c <= 0x11C3E)
 		    || (c >= 0x11C92 && c <= 0x11CA7)
 		    || (c >= 0x11CA9 && c <= 0x11CB6);
 	case 0x16B00:
 		return (c >= 0x16B30 && c <= 0x16B36);
 	case 0x16F00:
 		return (c >= 0x16F51 && c <= 0x16F7E);
 	case 0x1BC00:
 		return c == 0x1BC9E;
 	case 0x1E000:
 		return (c >= 0x1E000 && c <= 0x1E006)
 		    || (c >= 0x1E008 && c <= 0x1E018)
 		    || (c >= 0x1E01B && c <= 0x1E021)
 		    || (c >= 0x1E023 && c <= 0x1E024)
 		    || (c >= 0x1E026 && c <= 0x1E02A);
 	case 0x1E900:
 		return c == 0x1E947;
 	case 0x1F100:
 		return (c >= 0x01F130 && c <= 0x01F149)
 		    || (c >= 0x01F150 && c <= 0x01F169)
 		    || (c >= 0x01F170 && c <= 0x01F189);
 	default:
 		return 0;
 	}
 }

 int ucd_isalnum(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
 	{
 	case UCD_CATEGORY_Lu:
 	case UCD_CATEGORY_Ll:
 	case UCD_CATEGORY_Lt:
 	case UCD_CATEGORY_Lm:
 	case UCD_CATEGORY_Lo:
 	case UCD_CATEGORY_Lt:
 	case UCD_CATEGORY_Lu:
 	case UCD_CATEGORY_Nd:
 	case UCD_CATEGORY_Nl:
 	case UCD_CATEGORY_Nd:
 	case UCD_CATEGORY_No:
 		return 1;
 	case UCD_CATEGORY_Mn:
 	case UCD_CATEGORY_Mc:
 	case UCD_CATEGORY_So:
 		return other_alphabetic_MnMcSo(c);
 	default:
 		return 0;
 	}
@@ -42,35 +299,52 @@ int ucd_isalpha(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
 	{
 	case UCD_CATEGORY_Lu:
 	case UCD_CATEGORY_Ll:
 	case UCD_CATEGORY_Lt:
 	case UCD_CATEGORY_Lm:
 	case UCD_CATEGORY_Lo:
 	case UCD_CATEGORY_Lt:
 	case UCD_CATEGORY_Lu:
 	case UCD_CATEGORY_Nl:
 		return 1;
 	case UCD_CATEGORY_Mn:
 	case UCD_CATEGORY_Mc:
 	case UCD_CATEGORY_So:
 		return other_alphabetic_MnMcSo(c);
 	default:
 		return 0;
 	}
 }

 int ucd_iscntrl(codepoint_t c)
 {
 	return ucd_lookup_category(c) == UCD_CATEGORY_Cc;
 }

 int ucd_isdigit(codepoint_t c)
 int ucd_isblank(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
 	{
 	case UCD_CATEGORY_Nd:
 	case UCD_CATEGORY_Nl:
 	case UCD_CATEGORY_No:
 	case UCD_CATEGORY_Zs:
 		switch (c) // Exclude characters with the <noBreak> DispositionType
 		{
 		case 0x00A0: // U+00A0 : NO-BREAK SPACE
 		case 0x2007: // U+2007 : FIGURE SPACE
 		case 0x202F: // U+202F : NARROW NO-BREAK SPACE
 			return 0;
 		}
 		return 1;
 	case UCD_CATEGORY_Cc:
 		return c == 0x09; // U+0009 : CHARACTER TABULATION
 	default:
 		return 0;
 	}
 }

 /* 'cntrl' class test: true only for codepoints whose general category is Cc. */
 int ucd_iscntrl(codepoint_t c)
 {
 	if (ucd_lookup_category(c) != UCD_CATEGORY_Cc)
 		return 0;
 	return 1;
 }

 /* 'digit' class test: true only for the ASCII decimal digits U+0030..U+0039. */
 int ucd_isdigit(codepoint_t c)
 {
 	if (c < 0x30) // below '0'
 		return 0;
 	return c <= 0x39; // at most '9'
 }

 int ucd_isgraph(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
@@ -92,7 +366,40 @@ int ucd_isgraph(codepoint_t c)

 /* 'lower' class test.
  *
  * A codepoint is reported as lower-case when:
  *   - its general category is Ll; or
  *   - its category is Lt and ucd_toupper maps it to a different codepoint; or
  *   - it is one of the explicitly listed codepoints (annotated Other_Lowercase
  *     below) in the Lo, Lm, Mn, Nl or So categories.
  *
  * The category-only early return that used to precede the switch has been
  * dropped: it made every case below unreachable.
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is in the 'lower' class, zero otherwise.
  */
 int ucd_islower(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
 	{
 	case UCD_CATEGORY_Ll:
 		return 1;
 	case UCD_CATEGORY_Lt:
 		return ucd_toupper(c) != c;
 	case UCD_CATEGORY_Lo:
 		return c == 0xAA  // Other_Lowercase : FEMININE ORDINAL INDICATOR
 		    || c == 0xBA; // Other_Lowercase : MASCULINE ORDINAL INDICATOR
 	case UCD_CATEGORY_Lm:
 		return (c >= 0x02B0 && c <= 0x02B8)  // Other_Lowercase
 		    || (c >= 0x02C0 && c <= 0x02C1)  // Other_Lowercase
 		    || (c >= 0x02E0 && c <= 0x02E4)  // Other_Lowercase
 		    ||  c == 0x037A                  // Other_Lowercase
 		    || (c >= 0x1D2C && c <= 0x1D6A)  // Other_Lowercase
 		    ||  c == 0x1D78                  // Other_Lowercase
 		    || (c >= 0x1D9B && c <= 0x1DBF)  // Other_Lowercase
 		    ||  c == 0x2071                  // Other_Lowercase
 		    ||  c == 0x207F                  // Other_Lowercase
 		    || (c >= 0x2090 && c <= 0x209C)  // Other_Lowercase
 		    || (c >= 0x2C7C && c <= 0x2C7D)  // Other_Lowercase
 		    || (c >= 0xA69C && c <= 0xA69D)  // Other_Lowercase
 		    ||  c == 0xA770                  // Other_Lowercase
 		    || (c >= 0xA7F8 && c <= 0xA7F9)  // Other_Lowercase
 		    || (c >= 0xAB5C && c <= 0xAB5F); // Other_Lowercase
 	case UCD_CATEGORY_Mn:
 		return c == 0x0345; // Other_Lowercase : COMBINING GREEK YPOGEGRAMMENI
 	case UCD_CATEGORY_Nl:
 		return (c >= 0x2170 && c <= 0x217F); // Other_Lowercase
 	case UCD_CATEGORY_So:
 		return (c >= 0x24D0 && c <= 0x24E9); // Other_Lowercase
 	default:
 		return 0;
 	}
 }

 int ucd_isprint(codepoint_t c)
@@ -134,10 +441,18 @@ int ucd_isspace(codepoint_t c)
 	{
 	case UCD_CATEGORY_Zl:
 	case UCD_CATEGORY_Zp:
 		return 1;
 	case UCD_CATEGORY_Zs:
 		switch (c) // Exclude characters with the <noBreak> DispositionType
 		{
 		case 0x00A0: // U+00A0 : NO-BREAK SPACE
 		case 0x2007: // U+2007 : FIGURE SPACE
 		case 0x202F: // U+202F : NARROW NO-BREAK SPACE
 			return 0;
 		}
 		return 1;
 	case UCD_CATEGORY_Cc:
 		switch (c) // Some control characters are also whitespace characters:
 		switch (c) // Include control characters marked as White_Space
 		{
 		case 0x09: // U+0009 : CHARACTER TABULATION
 		case 0x0A: // U+000A : LINE FEED
@@ -154,5 +469,27 @@ int ucd_isspace(codepoint_t c)

 /* 'upper' class test.
  *
  * A codepoint is reported as upper-case when:
  *   - its general category is Lu; or
  *   - its category is Lt and ucd_tolower maps it to a different codepoint; or
  *   - it is one of the explicitly listed codepoints (annotated Other_Uppercase
  *     below) in the Nl or So categories.
  *
  * The category-only early return that used to precede the switch has been
  * dropped: it made every case below unreachable.
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is in the 'upper' class, zero otherwise.
  */
 int ucd_isupper(codepoint_t c)
 {
 	switch (ucd_lookup_category(c))
 	{
 	case UCD_CATEGORY_Lu:
 		return 1;
 	case UCD_CATEGORY_Lt:
 		return ucd_tolower(c) != c;
 	case UCD_CATEGORY_Nl:
 		return (c >= 0x002160 && c <= 0x00216F); // Other_Uppercase
 	case UCD_CATEGORY_So:
 		return (c >= 0x0024B6 && c <= 0x0024CF)  // Other_Uppercase
 		    || (c >= 0x01F130 && c <= 0x01F149)  // Other_Uppercase
 		    || (c >= 0x01F150 && c <= 0x01F169)  // Other_Uppercase
 		    || (c >= 0x01F170 && c <= 0x01F189); // Other_Uppercase
 	default:
 		return 0;
 	}
 }

 /* 'xdigit' class test: true only for the ASCII hexadecimal digits. */
 int ucd_isxdigit(codepoint_t c)
 {
 	if (c >= 0x30 && c <= 0x39) // [0-9]
 		return 1;
 	if (c >= 0x41 && c <= 0x46) // [A-F]
 		return 1;
 	return (c >= 0x61 && c <= 0x66); // [a-f]
 }
--- a/src/ucd-tools/src/include/ucd/ucd.h
+++ b/src/ucd-tools/src/include/ucd/ucd.h
@@ -1,6 +1,6 @@
 /* Unicode Character Database API
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 * Copyright (C) 2012-2017 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
@@ -325,76 +325,90 @@ const char *ucd_get_script_string(ucd_script s);
  */
 ucd_script ucd_lookup_script(codepoint_t c);

 /** @brief Is the codepoint an alpha-numeric character?
 /** @brief Is the codepoint in the 'alnum' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a letter or number, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
  */
 int ucd_isalnum(codepoint_t c);

 /** @brief Is the codepoint a letter?
 /** @brief Is the codepoint in the 'alpha' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a letter, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
  */
 int ucd_isalpha(codepoint_t c);

 /** @brief Is the codepoint a control character?
 /** @brief Is the codepoint in the 'blank' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a control character, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'blank' class, zero otherwise.
  */
 int ucd_isblank(codepoint_t c);

 /** @brief Is the codepoint in the 'cntrl' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
  */
 int ucd_iscntrl(codepoint_t c);

 /** @brief Is the codepoint a numeric character?
 /** @brief Is the codepoint in the 'digit' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a number, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'digit' class, zero otherwise.
  */
 int ucd_isdigit(codepoint_t c);

 /** @brief Does the codepoint have a displayable glyph?
 /** @brief Is the codepoint in the 'graph' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint has a displayable glyph, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'graph' class, zero otherwise.
  */
 int ucd_isgraph(codepoint_t c);

 /** @brief Is the codepoint a lower-case letter?
 /** @brief Is the codepoint in the 'lower' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a lower-case letter, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'lower' class, zero otherwise.
  */
 int ucd_islower(codepoint_t c);

 /** @brief Is the codepoint a printable character?
 /** @brief Is the codepoint in the 'print' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a printable character, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'print' class, zero otherwise.
  */
 int ucd_isprint(codepoint_t c);

 /** @brief Is the codepoint a punctuation character?
 /** @brief Is the codepoint in the 'punct' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a punctuation character, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'punct' class, zero otherwise.
  */
 int ucd_ispunct(codepoint_t c);

 /** @brief Is the codepoint a whitespace character?
 /** @brief Is the codepoint in the 'space' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is a whitespace character, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'space' class, zero otherwise.
  */
 int ucd_isspace(codepoint_t c);

 /** @brief Is the codepoint an upper-case letter?
 /** @brief Is the codepoint in the 'upper' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is an upper-case letter, zero otherwise.
  * @return  Non-zero if the codepoint is in the 'upper' class, zero otherwise.
  */
 int ucd_isupper(codepoint_t c);

 /** @brief Is the codepoint in the 'xdigit' class?
  *
  * @param c The Unicode codepoint to check.
  * @return  Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
  */
 int ucd_isxdigit(codepoint_t c);

 /** @brief Convert the Unicode codepoint to upper-case.
  *
  * This function only uses the simple case mapping present in the
@@ -756,106 +770,126 @@ namespace ucd
 		return (script)ucd_lookup_script(c);
 	}

 	/** @brief Is the codepoint an alpha-numeric character?
 	/** @brief Is the codepoint in the 'alnum' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a letter or number, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
 	  */
 	inline int isalnum(codepoint_t c)
 	{
 		return ucd_isalnum(c);
 	}

 	/** @brief Is the codepoint a letter?
 	/** @brief Is the codepoint in the 'alpha' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a letter, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
 	  */
 	inline int isalpha(codepoint_t c)
 	{
 		return ucd_isalpha(c);
 	}

 	/** @brief Is the codepoint a control character?
 	/** @brief Is the codepoint in the 'blank' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is in the 'blank' class, zero otherwise.
 	  */
 	inline int isblank(codepoint_t c)
 	{
 		return ucd_isblank(c);
 	}

 	/** @brief Is the codepoint in the 'cntrl' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a control character, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
 	  */
 	inline int iscntrl(codepoint_t c)
 	{
 		return ucd_iscntrl(c);
 	}

 	/** @brief Is the codepoint a numeric character?
 	/** @brief Is the codepoint in the 'digit' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a number, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'digit' class, zero otherwise.
 	  */
 	inline int isdigit(codepoint_t c)
 	{
 		return ucd_isdigit(c);
 	}

 	/** @brief Does the codepoint have a displayable glyph?
 	/** @brief Is the codepoint in the 'graph' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint has a displayable glyph, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'graph' class, zero otherwise.
 	  */
 	inline int isgraph(codepoint_t c)
 	{
 		return ucd_isgraph(c);
 	}

 	/** @brief Is the codepoint a lower-case letter?
 	/** @brief Is the codepoint in the 'lower' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a lower-case letter, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'lower' class, zero otherwise.
 	  */
 	inline int islower(codepoint_t c)
 	{
 		return ucd_islower(c);
 	}

 	/** @brief Is the codepoint a printable character?
 	/** @brief Is the codepoint in the 'print' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a printable character, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'print' class, zero otherwise.
 	  */
 	inline int isprint(codepoint_t c)
 	{
 		return ucd_isprint(c);
 	}

 	/** @brief Is the codepoint a punctuation character?
 	/** @brief Is the codepoint in the 'punct' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a punctuation character, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'punct' class, zero otherwise.
 	  */
 	inline int ispunct(codepoint_t c)
 	{
 		return ucd_ispunct(c);
 	}

 	/** @brief Is the codepoint a whitespace character?
 	/** @brief Is the codepoint in the 'space' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is a whitespace character, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'space' class, zero otherwise.
 	  */
 	inline int isspace(codepoint_t c)
 	{
 		return ucd_isspace(c);
 	}

 	/** @brief Is the codepoint an upper-case letter?
 	/** @brief Is the codepoint in the 'upper' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is an upper-case letter, zero otherwise.
 	  * @return  Non-zero if the codepoint is in the 'upper' class, zero otherwise.
 	  */
 	inline int isupper(codepoint_t c)
 	{
 		return ucd_isupper(c);
 	}

 	/** @brief Is the codepoint in the 'xdigit' class?
 	  *
 	  * @param c The Unicode codepoint to check.
 	  * @return  Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
 	  */
 	inline int isxdigit(codepoint_t c)
 	{
 		return ucd_isxdigit(c);
 	}

 	/** @brief Convert the Unicode codepoint to upper-case.
 	  *
 	  * This function only uses the simple case mapping present in the
--- a/src/ucd-tools/tests/printcdata.c
+++ b/src/ucd-tools/tests/printcdata.c
@@ -0,0 +1,255 @@
 /*
 * Copyright (C) 2012-2017 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */

 #include "ucd/ucd.h"

 #include <locale.h>
 #include <string.h>
 #include <stdio.h>
 #include <wchar.h>
 #include <wctype.h>

 /* Write codepoint c to out as a UTF-8 byte sequence of one to four bytes.
  *
  * Codepoints of 0x200000 and above do not fit in four bytes and produce no
  * output.
  */
 void fput_utf8c(FILE *out, codepoint_t c)
 {
 	if (c < 0x80)
 	{
 		// single byte: the codepoint itself
 		fputc((uint8_t)c, out);
 		return;
 	}

 	if (c < 0x800)
 	{
 		// two bytes: 110xxxxx 10xxxxxx
 		fputc(0xC0 | (c >> 6), out);
 		fputc(0x80 | (c & 0x3F), out);
 		return;
 	}

 	if (c < 0x10000)
 	{
 		// three bytes: 1110xxxx 10xxxxxx 10xxxxxx
 		fputc(0xE0 | (c >> 12), out);
 		fputc(0x80 | ((c >> 6) & 0x3F), out);
 		fputc(0x80 | (c & 0x3F), out);
 		return;
 	}

 	if (c < 0x200000)
 	{
 		// four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 		fputc(0xF0 | (c >> 18), out);
 		fputc(0x80 | ((c >> 12) & 0x3F), out);
 		fputc(0x80 | ((c >> 6) & 0x3F), out);
 		fputc(0x80 | (c & 0x3F), out);
 	}
 }

 /* Read one UTF-8 encoded codepoint from in and store it in *c.
  *
  * The sequence length is chosen from the high nibble of the first byte:
  * 0xF0 reads three more bytes, 0xE0 reads two more, and every other
  * non-ASCII first byte reads one more. Following bytes are not validated;
  * their low six bits are simply appended.
  *
  * Returns 1 when a codepoint was read, or 0 when end of file is reached
  * (in which case *c may hold a partially assembled value).
  */
 int fget_utf8c(FILE *in, codepoint_t *c)
 {
 	int trailing; // bytes still to read after the first one
 	int lead = fgetc(in);
 	if (lead == EOF) return 0;

 	if ((uint8_t)lead < 0x80)
 	{
 		*c = (uint8_t)lead;
 		return 1;
 	}

 	if (((uint8_t)lead & 0xF0) == 0xF0)
 	{
 		*c = (uint8_t)lead & 0x07;
 		trailing = 3;
 	}
 	else if (((uint8_t)lead & 0xF0) == 0xE0)
 	{
 		*c = (uint8_t)lead & 0x0F;
 		trailing = 2;
 	}
 	else
 	{
 		*c = (uint8_t)lead & 0x1F;
 		trailing = 1;
 	}

 	while (trailing-- > 0)
 	{
 		int next = fgetc(in);
 		if (next == EOF) return 0;
 		*c = (*c << 6) + ((uint8_t)next & 0x3F);
 	}
 	return 1;
 }

 /* Print codepoint c to out in the representation selected by mode:
  *   'c'  the character itself as UTF-8, with tab, carriage return and
  *        newline written as the escapes \t, \r and \n
  *   'h'  six-digit lower-case hexadecimal
  *   'H'  six-digit upper-case hexadecimal
  * Any other mode prints nothing.
  */
 void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
 {
 	if (mode == 'h')
 		fprintf(out, "%06x", c);
 	else if (mode == 'H')
 		fprintf(out, "%06X", c);
 	else if (mode == 'c')
 	{
 		if (c == '\t')
 			fputs("\\t", out);
 		else if (c == '\r')
 			fputs("\\r", out);
 		else if (c == '\n')
 			fputs("\\n", out);
 		else
 			fput_utf8c(out, c);
 	}
 }

 /* Print '1' or '0' to out according to the C library's isw* classification
  * of c. The classification function is selected by mode:
  *   'A' iswalnum   'a' iswalpha   'b' iswblank   'c' iswcntrl
  *   'd' iswdigit   'g' iswgraph   'l' iswlower   'P' iswprint
  *   'p' iswpunct   's' iswspace   'u' iswupper   'x' iswxdigit
  * Any other mode prints nothing.
  */
 void uprintf_is(FILE *out, codepoint_t c, char mode)
 {
 	int in_class;
 	switch (mode)
 	{
 	case 'A': in_class = iswalnum(c);  break; // alpha-numeric
 	case 'a': in_class = iswalpha(c);  break; // alpha
 	case 'b': in_class = iswblank(c);  break; // blank
 	case 'c': in_class = iswcntrl(c);  break; // control
 	case 'd': in_class = iswdigit(c);  break; // numeric
 	case 'g': in_class = iswgraph(c);  break; // glyph
 	case 'l': in_class = iswlower(c);  break; // lower case
 	case 'P': in_class = iswprint(c);  break; // printable
 	case 'p': in_class = iswpunct(c);  break; // punctuation
 	case 's': in_class = iswspace(c);  break; // whitespace
 	case 'u': in_class = iswupper(c);  break; // upper case
 	case 'x': in_class = iswxdigit(c); break; // xdigit
 	default:
 		return; // unrecognised mode: nothing is written
 	}
 	fputc(in_class ? '1' : '0', out);
 }

 /* Print information about codepoint c to out as directed by format.
  *
  * Directives introduced by '%':
  *   %c   general category name
  *   %C   general category group name
  *   %s   script name
  *   %pM  the codepoint itself
  *   %LM  towlower(c)
  *   %UM  towupper(c)
  *   %TM  ucd_totitle(c)
  *   %iM  an is* classification flag
  * where M is the mode character passed on to uprintf_codepoint
  * (%p, %L, %U, %T) or uprintf_is (%i). Unknown directives print nothing.
  *
  * Escapes introduced by '\': \t, \r and \n print the matching control
  * character; any other escaped character is printed as-is.
  *
  * Every other character is copied through unchanged. A format that ends
  * part-way through a directive or escape is accepted; the incomplete tail
  * prints nothing.
  */
 void uprintf(FILE *out, codepoint_t c, const char *format)
 {
 	while (*format) switch (*format)
 	{
 	case '%':
 		switch (*++format)
 		{
 		case 'c': // category
 			fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
 			break;
 		case 'C': // category group
 			fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
 			break;
 		case 'p': // codepoint
 			uprintf_codepoint(out, c, *++format);
 			break;
 		case 'i': // is*
 			uprintf_is(out, c, *++format);
 			break;
 		case 'L': // lowercase
 			uprintf_codepoint(out, towlower(c), *++format);
 			break;
 		case 's': // script
 			fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
 			break;
 		case 'T': // titlecase
 			uprintf_codepoint(out, ucd_totitle(c), *++format);
 			break;
 		case 'U': // uppercase
 			uprintf_codepoint(out, towupper(c), *++format);
 			break;
 		}
 		// Only step over the directive/mode character when there is one:
 		// a format ending in "%" or "%p" leaves format on the terminating
 		// NUL, and advancing again would read past the end of the string.
 		if (*format)
 			++format;
 		break;
 	case '\\':
 		switch (*++format) {
 		case 0:
 			break;
 		case 't':
 			fputc('\t', out);
 			++format;
 			break;
 		case 'r':
 			fputc('\r', out);
 			++format;
 			break;
 		case 'n':
 			fputc('\n', out);
 			++format;
 			break;
 		default:
 			fputc(*format, out);
 			++format;
 			break;
 		}
 		break;
 	default:
 		fputc(*format, out);
 		++format;
 		break;
 	}
 }

 /* Decode every UTF-8 codepoint in the stream in and print one record per
  * codepoint to stdout. When format is NULL a built-in tab-separated layout
  * is used.
  */
 void print_file(FILE *in, const char *format)
 {
 	const char *layout = format;
 	codepoint_t c;

 	if (layout == NULL)
 		layout = "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n";

 	for (c = 0; fget_utf8c(in, &c); )
 		uprintf(stdout, c, layout);
 }

 /* Command-line entry point.
  *
  * Recognised arguments:
  *   --stdin, -        read UTF-8 text from standard input
  *   --format=FORMAT   use FORMAT (see uprintf) instead of the default layout
  *   --locale=LOCALE   set LC_CTYPE to LOCALE before classifying characters
  *   FILE              read UTF-8 text from FILE (only while no input has
  *                     been selected yet)
  *
  * With an input selected, one record is printed per codepoint read from it.
  * With no input (including when FILE could not be opened), one record is
  * printed for every codepoint from 0 to 0x10FFFF.
  */
 int main(int argc, char **argv)
 {
 	FILE *in = NULL;
 	const char *format = NULL;
 	for (int argn = 1; argn != argc; ++argn)
 	{
 		const char *arg = argv[argn];
 		if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
 			in = stdin;
 		else if (!strncmp(arg, "--format=", 9))
 			format = arg + 9;
 		else if (!strncmp(arg, "--locale=", 9))
 			setlocale(LC_CTYPE, arg + 9);
 		else if (in == NULL)
 		{
 			in = fopen(arg, "r");
 			if (!in)
 			{
 				// Report the argument that actually failed; it is not
 				// necessarily argv[1] when options precede the file name.
 				fprintf(stdout, "cannot open `%s`\n", arg);
 			}
 		}
 	}

 	if (in == stdin)
 		print_file(stdin, format);
 	else if (in != NULL)
 	{
 		print_file(in, format);
 		fclose(in);
 	}
 	else
 	{
 		for (codepoint_t c = 0; c <= 0x10FFFF; ++c)
 			uprintf(stdout, c, format ? format :
 			        "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n");
 	}
 	return 0;
 }
--- a/src/ucd-tools/tests/printucddata.c
+++ b/src/ucd-tools/tests/printucddata.c
@@ -1,5 +1,5 @@
 /*
 * Copyright (C) 2012-2016 Reece H. Dunn
 * Copyright (C) 2012-2017 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
@@ -101,6 +101,49 @@ void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
 	}
 }

 // Write '1' or '0' to out according to the ucd_is* classification of c
 // selected by mode. An unrecognised mode writes nothing.
 void uprintf_is(FILE *out, codepoint_t c, char mode)
 {
 	int matched;
 	switch (mode)
 	{
 	case 'A': matched = ucd_isalnum(c)  != 0; break; // alpha-numeric
 	case 'a': matched = ucd_isalpha(c)  != 0; break; // alpha
 	case 'b': matched = ucd_isblank(c)  != 0; break; // blank
 	case 'c': matched = ucd_iscntrl(c)  != 0; break; // control
 	case 'd': matched = ucd_isdigit(c)  != 0; break; // numeric
 	case 'g': matched = ucd_isgraph(c)  != 0; break; // glyph
 	case 'l': matched = ucd_islower(c)  != 0; break; // lower case
 	case 'P': matched = ucd_isprint(c)  != 0; break; // printable
 	case 'p': matched = ucd_ispunct(c)  != 0; break; // punctuation
 	case 's': matched = ucd_isspace(c)  != 0; break; // whitespace
 	case 'u': matched = ucd_isupper(c)  != 0; break; // upper case
 	case 'x': matched = ucd_isxdigit(c) != 0; break; // xdigit
 	default:
 		return; // unknown selector: no output
 	}
 	fputc(matched ? '1' : '0', out);
 }

 void uprintf(FILE *out, codepoint_t c, const char *format)
 {
 	while (*format) switch (*format)
@@ -117,6 +160,9 @@ void uprintf(FILE *out, codepoint_t c, const char *format)
 		case 'p': // codepoint
 			uprintf_codepoint(out, c, *++format);
 			break;
 		case 'i': // is*
 			uprintf_is(out, c, *++format);
 			break;
 		case 'L': // lowercase
 			uprintf_codepoint(out, ucd_tolower(c), *++format);
 			break;
@@ -129,13 +175,31 @@ void uprintf(FILE *out, codepoint_t c, const char *format)
 		case 'U': // uppercase
 			uprintf_codepoint(out, ucd_toupper(c), *++format);
 			break;
 		case 'W': // whitespace
 			if (ucd_isspace(c))
 				fputs("White_Space", out);
 			break;
 		}
 		++format;
 		break;
 	case '\\':
 		switch (*++format) {
 		case 0:
 			break;
 		case 't':
 			fputc('\t', out);
 			++format;
 			break;
 		case 'r':
 			fputc('\r', out);
 			++format;
 			break;
 		case 'n':
 			fputc('\n', out);
 			++format;
 			break;
 		default:
 			fputc(*format, out);
 			++format;
 			break;
 		}
 		break;
 	default:
 		fputc(*format, out);
 		++format;
@@ -143,35 +207,44 @@ void uprintf(FILE *out, codepoint_t c, const char *format)
 	}
 }

 void print_file(FILE *in)
 void print_file(FILE *in, const char *format)
 {
 	codepoint_t c = 0;
 	while (fget_utf8c(in, &c))
 		uprintf(stdout, c, "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n");
 		uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n");
 }

 int main(int argc, char **argv)
 {
 	if (argc == 2)
 	FILE *in = NULL;
 	const char *format = NULL;
 	for (int argn = 1; argn != argc; ++argn)
 	{
 		if (!strcmp(argv[1], "--stdin") || !strcmp(argv[1], "-"))
 			print_file(stdin);
 		else
 		const char *arg = argv[argn];
 		if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
 			in = stdin;
 		else if (!strncmp(arg, "--format=", 9))
 			format = arg + 9;
 		else if (in == NULL)
 		{
 			FILE *in = fopen(argv[1], "r");
 			if (in)
 			{
 				print_file(in);
 				fclose(in);
 			}
 			else
 			in = fopen(arg, "r");
 			if (!in)
 				fprintf(stdout, "cannot open `%s`\n", argv[1]);
 		}
 	}

 	if (in == stdin)
 		print_file(stdin, format);
 	else if (in != NULL)
 	{
 		print_file(in, format);
 		fclose(in);
 	}
 	else
 	{
 		for (codepoint_t c = 0; c <= 0x10FFFF; ++c)
 			uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n");
 			uprintf(stdout, c, format ? format :
 			        "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n");
 	}
 	return 0;
 }
--- a/src/ucd-tools/tests/printucddata_cpp.cpp
+++ b/src/ucd-tools/tests/printucddata_cpp.cpp
@@ -1,5 +1,5 @@
 /*
 * Copyright (C) 2012-2016 Reece H. Dunn
 * Copyright (C) 2012-2017 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
@@ -101,6 +101,49 @@ void uprintf_codepoint(FILE *out, ucd::codepoint_t c, char mode)
 	}
 }

 // Write '1' or '0' to out according to the ucd::is* classification of c
 // selected by mode. An unrecognised mode writes nothing.
 void uprintf_is(FILE *out, ucd::codepoint_t c, char mode)
 {
 	bool matched = false;
 	switch (mode)
 	{
 	case 'A': matched = ucd::isalnum(c)  ? true : false; break; // alpha-numeric
 	case 'a': matched = ucd::isalpha(c)  ? true : false; break; // alpha
 	case 'b': matched = ucd::isblank(c)  ? true : false; break; // blank
 	case 'c': matched = ucd::iscntrl(c)  ? true : false; break; // control
 	case 'd': matched = ucd::isdigit(c)  ? true : false; break; // numeric
 	case 'g': matched = ucd::isgraph(c)  ? true : false; break; // glyph
 	case 'l': matched = ucd::islower(c)  ? true : false; break; // lower case
 	case 'P': matched = ucd::isprint(c)  ? true : false; break; // printable
 	case 'p': matched = ucd::ispunct(c)  ? true : false; break; // punctuation
 	case 's': matched = ucd::isspace(c)  ? true : false; break; // whitespace
 	case 'u': matched = ucd::isupper(c)  ? true : false; break; // upper case
 	case 'x': matched = ucd::isxdigit(c) ? true : false; break; // xdigit
 	default:
 		return; // unknown selector: no output
 	}
 	fputc(matched ? '1' : '0', out);
 }

 void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
 {
 	while (*format) switch (*format)
@@ -117,6 +160,9 @@ void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
 		case 'p': // codepoint
 			uprintf_codepoint(out, c, *++format);
 			break;
 		case 'i': // is*
 			uprintf_is(out, c, *++format);
 			break;
 		case 'L': // lowercase
 			uprintf_codepoint(out, ucd::tolower(c), *++format);
 			break;
@@ -129,13 +175,31 @@ void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
 		case 'U': // uppercase
 			uprintf_codepoint(out, ucd::toupper(c), *++format);
 			break;
 		case 'W': // whitespace
 			if (ucd::isspace(c))
 				fputs("White_Space", out);
 			break;
 		}
 		++format;
 		break;
 	case '\\':
 		switch (*++format) {
 		case 0:
 			break;
 		case 't':
 			fputc('\t', out);
 			++format;
 			break;
 		case 'r':
 			fputc('\r', out);
 			++format;
 			break;
 		case 'n':
 			fputc('\n', out);
 			++format;
 			break;
 		default:
 			fputc(*format, out);
 			++format;
 			break;
 		}
 		break;
 	default:
 		fputc(*format, out);
 		++format;
@@ -143,35 +207,44 @@ void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
 	}
 }

 void print_file(FILE *in)
 void print_file(FILE *in, const char *format)
 {
 	ucd::codepoint_t c = 0;
 	while (fget_utf8c(in, c))
 		uprintf(stdout, c, "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n");
 		uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n");
 }

 int main(int argc, char **argv)
 {
 	if (argc == 2)
 	FILE *in = NULL;
 	const char *format = NULL;
 	for (int argn = 1; argn != argc; ++argn)
 	{
 		if (!strcmp(argv[1], "--stdin") || !strcmp(argv[1], "-"))
 			print_file(stdin);
 		else
 		const char *arg = argv[argn];
 		if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
 			in = stdin;
 		else if (!strncmp(arg, "--format=", 9))
 			format = arg + 9;
 		else if (in == NULL)
 		{
 			FILE *in = fopen(argv[1], "r");
 			if (in)
 			{
 				print_file(in);
 				fclose(in);
 			}
 			else
 			in = fopen(arg, "r");
 			if (!in)
 				fprintf(stdout, "cannot open `%s`\n", argv[1]);
 		}
 	}

 	if (in == stdin)
 		print_file(stdin, format);
 	else if (in != NULL)
 	{
 		print_file(in, format);
 		fclose(in);
 	}
 	else
 	{
 		for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
 			uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n");
 			uprintf(stdout, c, format ? format :
 			        "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il\n");
 	}
 	return 0;
 }
--- a/src/ucd-tools/tools/printdata.py
+++ b/src/ucd-tools/tools/printdata.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python

 # Copyright (C) 2012 Reece H. Dunn
 # Copyright (C) 2012-2017 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
@@ -24,15 +24,20 @@ import ucd
 ucd_rootdir = sys.argv[1]
 csur_rootdir = 'data/csur'

 null = ucd.CodePoint('0000')

 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	for codepoint in data['CodePoint']:
 		unicode_chars[codepoint] = data
 		unicode_chars[codepoint]['Properties'] = []
 for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
 	if data['Property'] in ['White_Space']:
 for propfile in ['PropList', 'DerivedCoreProperties']:
 	for data in ucd.parse_ucd_data(ucd_rootdir, propfile):
 		for codepoint in data['Range']:
 			unicode_chars[codepoint]['Properties'].append(data['Property'])
 			try:
 				unicode_chars[codepoint][data['Property']] = 1
 			except KeyError:
 				unicode_chars[codepoint] = {'CodePoint': codepoint}
 				unicode_chars[codepoint][data['Property']] = 1
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint]['Script'] = data['Script']
@@ -40,31 +45,91 @@ if '--with-csur' in sys.argv:
 	for csur in ['Klingon']:
 		for data in ucd.parse_ucd_data('data/csur', csur):
 			for codepoint in data['CodePoint']:
 				if not 'TitleCase'  in data: data['TitleCase']  = codepoint
 				if not 'UpperCase'  in data: data['UpperCase']  = codepoint
 				if not 'LowerCase'  in data: data['LowerCase']  = codepoint
 				if not 'Properties' in data: data['Properties'] = []
 				unicode_chars[codepoint] = data

 null = ucd.CodePoint('0000')
 def iscntrl(data):
 	"""Return 1 when the record's 'Name' field is exactly '<control>', else 0."""
 	if data.get('Name', '') == '<control>':
 		return 1
 	return 0

 def isdigit(data):
 	"""Return 1 when the record's code point character is one of '0'-'9', else 0."""
 	ch = data['CodePoint'].char()
 	if ch in '0123456789':
 		return 1
 	return 0

 def isxdigit(data):
 	"""Return 1 when the record's code point character is an ASCII hex digit, else 0."""
 	ch = data['CodePoint'].char()
 	if ch in '0123456789ABCDEFabcdef':
 		return 1
 	return 0

 def isspace(data):
 	"""Return 1 for White_Space records that are not <noBreak> decompositions.

 	A record counts when its 'White_Space' entry is truthy and its
 	'DecompositionType' is None, absent, or does not start with '<noBreak>'.
 	Everything else yields 0.
 	"""
 	if not data.get('White_Space', 0):
 		return 0
 	decomposition = data.get('DecompositionType', '')
 	if decomposition is None or not decomposition.startswith('<noBreak>'):
 		return 1
 	return 0

 def isblank(data): # word separator
 	"""Return 1 for space separators (or TAB) that are not <noBreak> decompositions.

 	A record qualifies when its 'GeneralCategory' is 'Zs' or its code point
 	character is a tab, and its 'DecompositionType' is None, absent, or does
 	not start with '<noBreak>'. Everything else yields 0.
 	"""
 	# The code point is only consulted when the category is not 'Zs'.
 	separator = data.get('GeneralCategory', 'Cn') == 'Zs' or data['CodePoint'].char() == '\t'
 	if not separator:
 		return 0
 	decomposition = data.get('DecompositionType', '')
 	if decomposition is None:
 		return 1
 	return 0 if decomposition.startswith('<noBreak>') else 1

 def ispunct(data):
 	"""Return 1 when the record's general category starts with 'P', else 0.

 	A record without a 'GeneralCategory' entry is treated as 'Cn'.
 	"""
 	major = data.get('GeneralCategory', 'Cn')[0]
 	return 1 if major == 'P' else 0

 def isprint(data):
 	"""Return 1 when the general category's first letter is L, M, N, P, S or Z, else 0.

 	A record without a 'GeneralCategory' entry is treated as 'Cn'.
 	"""
 	major = data.get('GeneralCategory', 'Cn')[0]
 	printable = major in 'LMNPSZ' # not in 'CI'
 	return 1 if printable else 0

 def isgraph(data):
 	"""Return 1 when the general category's first letter is L, M, N, P or S, else 0.

 	A record without a 'GeneralCategory' entry is treated as 'Cn'.
 	"""
 	major = data.get('GeneralCategory', 'Cn')[0]
 	graphic = major in 'LMNPS' # not in 'CZI'
 	return 1 if graphic else 0

 def isalnum(data):
 	"""Return 1 for records in an 'N*' general category, else their 'Alphabetic' value.

 	A record without a 'GeneralCategory' entry is treated as 'Cn'; a record
 	without an 'Alphabetic' entry yields 0.
 	"""
 	major = data.get('GeneralCategory', 'Cn')[0]
 	if major == 'N':
 		return 1
 	return data.get('Alphabetic', 0)

 def isalpha(data):
 	"""Return the record's 'Alphabetic' entry, or 0 when the record has none."""
 	alphabetic = data.get('Alphabetic', 0)
 	return alphabetic

 def isupper(data):
 	"""Return 1 when the record is flagged 'Uppercase' or has a non-null 'LowerCase' mapping.

 	The 'LowerCase' entry is only examined when 'Uppercase' is absent or
 	falsy; a missing 'LowerCase' entry counts as null. Otherwise returns 0.
 	"""
 	# Some Lt characters have lowercase forms.
 	if data.get('Uppercase', 0) or data.get('LowerCase', null) != null:
 		return 1
 	return 0

 def islower(data):
 	"""Return 1 when the record is flagged 'Lowercase' or has a non-null 'UpperCase' mapping.

 	The 'UpperCase' entry is only examined when 'Lowercase' is absent or
 	falsy; a missing 'UpperCase' entry counts as null. Otherwise returns 0.
 	"""
 	if data.get('Lowercase', 0) or data.get('UpperCase', null) != null:
 		return 1
 	return 0

 if __name__ == '__main__':
 	for codepoint in ucd.CodeRange('000000..10FFFF'):
 		try:
 			data = unicode_chars[codepoint]
 		except KeyError:
 			data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []}
 		try:
 			script = data['Script']
 		except KeyError:
 			script = 'Zzzz'
 		title = data['TitleCase']
 		upper = data['UpperCase']
 		lower = data['LowerCase']
 			data = {'CodePoint': codepoint}
 		script = data.get('Script', 'Zzzz')
 		title = data.get('TitleCase', codepoint)
 		upper = data.get('UpperCase', codepoint)
 		lower = data.get('LowerCase', codepoint)
 		if title == null: title = codepoint
 		if upper == null: upper = codepoint
 		if lower == null: lower = codepoint
 		print('%s %s %s %s %s %s %s %s' % (
 		print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (
 		      codepoint, script,
 		      data['GeneralCategory'][0], data['GeneralCategory'],
 		      data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'),
 		      upper, lower, title,
 		      ' '.join(data['Properties'])))
 		      isdigit(data), isxdigit(data),
 		      iscntrl(data), isspace(data), isblank(data), ispunct(data),
 		      isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data)))
--- a/src/ucd-tools/tools/ucd.py
+++ b/src/ucd-tools/tools/ucd.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python

 # Copyright (C) 2012-2014 Reece H. Dunn
 # Copyright (C) 2012-2017 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
@@ -50,6 +50,9 @@ class CodePoint:
 	def __lt__(self, other):
 		"""Order code points by comparing their numeric `codepoint` attributes."""
 		return self.codepoint < other.codepoint

 	def char(self):
 		"""Return the character for this code point, as produced by `unichr`."""
 		# NOTE(review): `unichr` is a Python 2 builtin — confirm the interpreter this tool targets.
 		return unichr(self.codepoint)

 class CodeRange:
 	def __init__(self, x):
 		f, l = x.split('..')
@@ -69,6 +72,9 @@ class CodeRange:
 	def size(self):
 		"""Return the number of code points in the range, counting both `first` and `last`."""
 		return self.last.codepoint - self.first.codepoint + 1

 	def char(self):
 		"""Return the character for the first code point of the range, as produced by `unichr`."""
 		# NOTE(review): `unichr` is a Python 2 builtin — confirm the interpreter this tool targets.
 		return unichr(self.first.codepoint)

 def codepoint(x):
 	if '..' in x[0]:
 		return CodeRange(x[0]), x[1:]
@@ -107,6 +113,10 @@ data_items = {
 		('Range', codepoint),
 		('Age', string),
 	],
 	'DerivedCoreProperties': [
 		('Range', codepoint),
 		('Property', string),
 	],
 	'PropList': [
 		('Range', codepoint),
 		('Property', string),