Browse Source

Merge commit 'f26aca0aa1b9485634314d0d1710d2a0bb67776d'

master
Reece H. Dunn 8 years ago
parent
commit
8566f2ee35

+ 1
- 0
src/ucd-tools/.gitignore View File

@@ -1,4 +1,5 @@
.*.swp
*~

# intermediate files:


+ 5
- 2
src/ucd-tools/CHANGELOG.md View File

@@ -7,11 +7,14 @@ These are eSpeak NG specific modifications to the `ucd-tools` project:
* `data/espeak-ng` data files for eSpeak NG extended data.
* espeak-ng PropList property lookup as part of the `ucd_property` API.

## 9.0.0.1 - (In Progress)
## 10.0.0 - 2017-06-25

* Add `iswblank` and `iswxdigit` compatibility.
* Improve ctype compatibility.
* PropList property lookup.
* PropList and emoji-data property lookup.
* Support building with a C89 compiler.
* Update to Unicode Character Data 10.0.0.
* Unicode Emoji 5.0.

## 9.0.0 - 2016-12-28


+ 7
- 7
src/ucd-tools/Makefile.am View File

@@ -55,34 +55,34 @@ EXTRA_DIST += ChangeLog

############################# Unicode Data ####################################

# Versions and locations used by the data-download rules below.
# NOTE(review): EMOJI_VERSION is assigned twice; in make the later assignment
# (5.0) is the one in effect -- the 4.0 line looks like the pre-change value.
EMOJI_VERSION=4.0
EMOJI_VERSION=5.0
# @UCD_VERSION@ is in @VAR@ placeholder form; presumably filled in by configure.
UCD_VERSION=@UCD_VERSION@
UCD_ROOTDIR=data/ucd
# Base URL that the curl commands below append version and file paths to.
UCD_SRCDIR=http://www.unicode.org/Public

# Create data/emoji and download emoji-data.txt for ${EMOJI_VERSION} into the
# target. Both curl lines fetch the same URL into $@: the first through shell
# redirection, the second through curl's -o option.
data/emoji/emoji-data.txt:
mkdir -pv data/emoji
curl ${UCD_SRCDIR}/emoji/${EMOJI_VERSION}/emoji-data.txt > $@
curl ${UCD_SRCDIR}/emoji/${EMOJI_VERSION}/emoji-data.txt -o $@

# Create data/ucd and download PropList.txt for ${UCD_VERSION} into the target.
# Both curl lines fetch the same URL into $@ (redirection, then -o).
data/ucd/PropList.txt:
mkdir -pv data/ucd
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropList.txt > $@
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropList.txt -o $@

# Create data/ucd and download DerivedCoreProperties.txt for ${UCD_VERSION}
# into the target. Both curl lines fetch the same URL into $@ (redirection,
# then -o).
data/ucd/DerivedCoreProperties.txt:
mkdir -pv data/ucd
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/DerivedCoreProperties.txt > $@
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/DerivedCoreProperties.txt -o $@

# Create data/ucd and download PropertyValueAliases.txt for ${UCD_VERSION}
# into the target. Both curl lines fetch the same URL into $@ (redirection,
# then -o).
data/ucd/PropertyValueAliases.txt:
mkdir -pv data/ucd
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropertyValueAliases.txt > $@
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/PropertyValueAliases.txt -o $@

# Create data/ucd and download Scripts.txt for ${UCD_VERSION} into the target.
# Both curl lines fetch the same URL into $@ (redirection, then -o).
data/ucd/Scripts.txt:
mkdir -pv data/ucd
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/Scripts.txt > $@
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/Scripts.txt -o $@

# Create data/ucd and download UnicodeData.txt for ${UCD_VERSION} into the
# target. Both curl lines fetch the same URL into $@ (redirection, then -o).
data/ucd/UnicodeData.txt:
mkdir -pv data/ucd
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/UnicodeData.txt > $@
curl ${UCD_SRCDIR}/${UCD_VERSION}/ucd/UnicodeData.txt -o $@

############################# documentation ###################################


+ 1
- 0
src/ucd-tools/configure.ac View File

@@ -24,6 +24,7 @@ dnl library checks.
dnl ================================================================

AC_CHECK_HEADERS([stddef.h]) dnl C89
AC_CHECK_FUNCS([iswblank]) dnl C99

AC_TYPE_UINT8_T
AC_TYPE_UINT32_T

+ 4
- 3
src/ucd-tools/src/case.c View File

@@ -18,14 +18,15 @@
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

// NOTE: This file is automatically generated from the UnicodeData.txt file in
// the Unicode Character database by the ucd-tools/tools/categories.py script.
/* NOTE: This file is automatically generated from the UnicodeData.txt file in
* the Unicode Character database by the ucd-tools/tools/categories.py script.
*/

#include "ucd/ucd.h"

#include <stddef.h>

// Unicode Character Data 9.0.0
/* Unicode Character Data 9.0.0 */

struct case_conversion_entry
{

+ 740
- 679
src/ucd-tools/src/categories.c
File diff suppressed because it is too large
View File


+ 20
- 20
src/ucd-tools/src/ctype.c View File

@@ -69,16 +69,16 @@ int ucd_isblank(codepoint_t c)
switch (ucd_lookup_category(c))
{
case UCD_CATEGORY_Zs:
switch (c) // Exclude characters with the <noBreak> DispositionType
switch (c) /* Exclude characters with the <noBreak> DispositionType */
{
case 0x00A0: // U+00A0 : NO-BREAK SPACE
case 0x2007: // U+2007 : FIGURE SPACE
case 0x202F: // U+202F : NARROW NO-BREAK SPACE
case 0x00A0: /* U+00A0 : NO-BREAK SPACE */
case 0x2007: /* U+2007 : FIGURE SPACE */
case 0x202F: /* U+202F : NARROW NO-BREAK SPACE */
return 0;
}
return 1;
case UCD_CATEGORY_Cc:
return c == 0x09; // U+0009 : CHARACTER TABULATION
return c == 0x09; /* U+0009 : CHARACTER TABULATION */
default:
return 0;
}
@@ -91,7 +91,7 @@ int ucd_iscntrl(codepoint_t c)

/* Return non-zero when c lies in U+0030..U+0039 (the ASCII digits 0-9),
 * zero otherwise.
 * NOTE(review): the second return is unreachable as written; it appears to be
 * the block-comment form of the first line shown alongside it -- confirm.
 */
int ucd_isdigit(codepoint_t c)
{
return (c >= 0x30 && c <= 0x39); // [0-9]
return (c >= 0x30 && c <= 0x39); /* [0-9] */
}

int ucd_isgraph(codepoint_t c)
@@ -174,23 +174,23 @@ int ucd_isspace(codepoint_t c)
case UCD_CATEGORY_Zp:
return 1;
case UCD_CATEGORY_Zs:
switch (c) // Exclude characters with the <noBreak> DispositionType
switch (c) /* Exclude characters with the <noBreak> DispositionType */
{
case 0x00A0: // U+00A0 : NO-BREAK SPACE
case 0x2007: // U+2007 : FIGURE SPACE
case 0x202F: // U+202F : NARROW NO-BREAK SPACE
case 0x00A0: /* U+00A0 : NO-BREAK SPACE */
case 0x2007: /* U+2007 : FIGURE SPACE */
case 0x202F: /* U+202F : NARROW NO-BREAK SPACE */
return 0;
}
return 1;
case UCD_CATEGORY_Cc:
switch (c) // Include control characters marked as White_Space
switch (c) /* Include control characters marked as White_Space */
{
case 0x09: // U+0009 : CHARACTER TABULATION
case 0x0A: // U+000A : LINE FEED
case 0x0B: // U+000B : LINE TABULATION
case 0x0C: // U+000C : FORM FEED
case 0x0D: // U+000D : CARRIAGE RETURN
case 0x85: // U+0085 : NEXT LINE
case 0x09: /* U+0009 : CHARACTER TABULATION */
case 0x0A: /* U+000A : LINE FEED */
case 0x0B: /* U+000B : LINE TABULATION */
case 0x0C: /* U+000C : FORM FEED */
case 0x0D: /* U+000D : CARRIAGE RETURN */
case 0x85: /* U+0085 : NEXT LINE */
return 1;
}
default:
@@ -217,7 +217,7 @@ int ucd_isupper(codepoint_t c)

/* Return non-zero when c is an ASCII hexadecimal digit, i.e. lies in
 * U+0030..U+0039, U+0041..U+0046 or U+0061..U+0066; zero otherwise.
 * NOTE(review): the second return statement is unreachable as written; it
 * appears to be the block-comment form of the first shown alongside it --
 * confirm.
 */
int ucd_isxdigit(codepoint_t c)
{
return (c >= 0x30 && c <= 0x39) // [0-9]
|| (c >= 0x41 && c <= 0x46) // [A-F]
|| (c >= 0x61 && c <= 0x66); // [a-f]
return (c >= 0x30 && c <= 0x39) /* [0-9] */
|| (c >= 0x41 && c <= 0x46) /* [A-F] */
|| (c >= 0x61 && c <= 0x66); /* [a-f] */
}

+ 10
- 0
src/ucd-tools/src/include/ucd/ucd.h View File

@@ -176,6 +176,7 @@ typedef enum ucd_script_
UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
UCD_SCRIPT_Geor, /**< @brief Georgian Script */
UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
UCD_SCRIPT_Goth, /**< @brief Gothic Script */
UCD_SCRIPT_Gran, /**< @brief Grantha Script */
UCD_SCRIPT_Grek, /**< @brief Greek Script */
@@ -273,6 +274,7 @@ typedef enum ucd_script_
UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
UCD_SCRIPT_Soyo, /**< @brief Soyombo */
UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
@@ -302,6 +304,7 @@ typedef enum ucd_script_
UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
UCD_SCRIPT_Yiii, /**< @brief Yi Script */
UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
UCD_SCRIPT_Zsym, /**< @brief Symbols */
@@ -366,6 +369,8 @@ typedef uint64_t ucd_property;
#define UCD_PROPERTY_EMOJI_PRESENTATION 0x0000000400000000ull /**< @brief Emoji_Presentation */
#define UCD_PROPERTY_EMOJI_MODIFIER 0x0000000800000000ull /**< @brief Emoji_Modifier */
#define UCD_PROPERTY_EMOJI_MODIFIER_BASE 0x0000001000000000ull /**< @brief Emoji_Modifier_Base */
#define UCD_PROPERTY_REGIONAL_INDICATOR 0x0000002000000000ull /**< @brief Regional_Indicator */
#define UCD_PROPERTY_EMOJI_COMPONENT 0x0000004000000000ull /**< @brief Emoji_Component */

// eSpeak NG extended properties:
#define ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION 0x0010000000000000ull /**< @brief Inverted_Terminal_Punctuation */
@@ -679,6 +684,7 @@ namespace ucd
Geok = UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
Geor = UCD_SCRIPT_Geor, /**< @brief Georgian Script */
Glag = UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
Gonm = UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
Goth = UCD_SCRIPT_Goth, /**< @brief Gothic Script */
Gran = UCD_SCRIPT_Gran, /**< @brief Grantha Script */
Grek = UCD_SCRIPT_Grek, /**< @brief Greek Script */
@@ -776,6 +782,7 @@ namespace ucd
Sind = UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
Sinh = UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
Sora = UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
Soyo = UCD_SCRIPT_Soyo, /**< @brief Soyombo */
Sund = UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
Sylo = UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
Syrc = UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
@@ -805,6 +812,7 @@ namespace ucd
Xpeo = UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
Xsux = UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
Yiii = UCD_SCRIPT_Yiii, /**< @brief Yi Script */
Zanb = UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
Zinh = UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
Zmth = UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
Zsym = UCD_SCRIPT_Zsym, /**< @brief Symbols */
@@ -876,6 +884,8 @@ namespace ucd
Emoji_Presentation = UCD_PROPERTY_EMOJI_PRESENTATION, /**< @brief Emoji_Presentation */
Emoji_Modifier = UCD_PROPERTY_EMOJI_MODIFIER, /**< @brief Emoji_Modifier */
Emoji_Modifier_Base = UCD_PROPERTY_EMOJI_MODIFIER_BASE, /**< @brief Emoji_Modifier_Base */
Regional_Indicator = UCD_PROPERTY_REGIONAL_INDICATOR, /**< @brief Regional_Indicator */
Emoji_Component = UCD_PROPERTY_EMOJI_COMPONENT, /**< @brief Emoji_Component */
};

/** @brief Return the properties of the specified codepoint.

+ 102
- 62
src/ucd-tools/src/proplist.c View File

@@ -78,9 +78,6 @@ static ucd_property properties_Cn(codepoint_t c)
case 0x2000:
if (c == 0x2065) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
break;
case 0x2300:
if (c == 0x23FF) return UCD_PROPERTY_PATTERN_SYNTAX;
break;
case 0x2400:
if (c >= 0x2427 && c <= 0x243F) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x244B && c <= 0x245F) return UCD_PROPERTY_PATTERN_SYNTAX;
@@ -90,7 +87,7 @@ static ucd_property properties_Cn(codepoint_t c)
if (c >= 0x2B96 && c <= 0x2B97) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x2BBA && c <= 0x2BBC) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x2BC9) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x2BD2 && c <= 0x2BEB) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x2BD3 && c <= 0x2BEB) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x2BF0 && c <= 0x2BFF) return UCD_PROPERTY_PATTERN_SYNTAX;
break;
case 0x2E00:
@@ -148,11 +145,11 @@ static ucd_property properties_Ll(codepoint_t c)
if (c == 0x029D) return UCD_PROPERTY_SOFT_DOTTED;
break;
case 0x0300:
if (c >= 0x03D0 && c <= 0x03D2) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x03D0 && c <= 0x03D2) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c == 0x03D5) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x03F0 && c <= 0x03F1) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x03F3) return UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x0400:
if (c == 0x0456) return UCD_PROPERTY_SOFT_DOTTED;
@@ -166,12 +163,12 @@ static ucd_property properties_Ll(codepoint_t c)
if (c == 0x1ECB) return UCD_PROPERTY_SOFT_DOTTED;
break;
case 0x2100:
if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x2133 && c <= 0x2134) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x2133 && c <= 0x2134) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c == 0x2139) return UCD_PROPERTY_EMOJI;
if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x2145 && c <= 0x2147) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x2145 && c <= 0x2147) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x2148 && c <= 0x2149) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
break;
case 0xFF00:
@@ -179,45 +176,45 @@ static ucd_property properties_Ll(codepoint_t c)
break;
case 0x01D400:
if (c >= 0x01D422 && c <= 0x01D423) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D456 && c <= 0x01D457) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D48A && c <= 0x01D48B) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D458 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D4AE && c <= 0x01D4B9) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D458 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D4AE && c <= 0x01D4B9) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c == 0x01D4BB) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D4BE && c <= 0x01D4BF) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D4BD && c <= 0x01D4C3) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D4F2 && c <= 0x01D4F3) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D4C5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D4C5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D500:
if (c <= 0x01D505) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c <= 0x01D505) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D526 && c <= 0x01D527) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D51E && c <= 0x01D539) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D51E && c <= 0x01D539) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D55A && c <= 0x01D55B) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D58E && c <= 0x01D58F) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D5C2 && c <= 0x01D5C3) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D5F6 && c <= 0x01D5F7) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D552) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D552) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D600:
if (c >= 0x01D62A && c <= 0x01D62B) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D65E && c <= 0x01D65F) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c >= 0x01D692 && c <= 0x01D693) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_SOFT_DOTTED;
if (c <= 0x01D6A5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c <= 0x01D6A5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D6C2 && c <= 0x01D6DA) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D6DC && c <= 0x01D6FA) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D6DC && c <= 0x01D6FA) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D6FC) return UCD_PROPERTY_OTHER_MATH;
break;
case 0x01D700:
if (c <= 0x01D714) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D716 && c <= 0x01D734) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D716 && c <= 0x01D734) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D736 && c <= 0x01D74E) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D750 && c <= 0x01D76E) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D750 && c <= 0x01D76E) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D770 && c <= 0x01D788) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D78A && c <= 0x01D7A8) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D78A && c <= 0x01D7A8) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D7AA && c <= 0x01D7C2) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D7C4 && c <= 0x01D7CB) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D7C4 && c <= 0x01D7CB) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
}
return 0;
@@ -332,7 +329,7 @@ static ucd_property properties_Lm(codepoint_t c)
break;
case 0x016F00:
if (c >= 0x016F93 && c <= 0x016F9F) return UCD_PROPERTY_DIACRITIC;
if (c == 0x016FE0) return UCD_PROPERTY_EXTENDER;
if (c >= 0x016FE0 && c <= 0x016FE1) return UCD_PROPERTY_EXTENDER;
break;
}
return 0;
@@ -407,19 +404,21 @@ static ucd_property properties_Lo_ideographic(codepoint_t c)
{
case 0x000000:
if (c >= 0x3400 && c <= 0x4DB5) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x4E00 && c <= 0x9FD5) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x4E00 && c <= 0x9FEA) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0xF900 && c <= 0xFA6D) return UCD_PROPERTY_IDEOGRAPHIC;
if (c >= 0xFA70 && c <= 0xFAD9) return UCD_PROPERTY_IDEOGRAPHIC;
break;
case 0x010000:
if (c >= 0x017000 && c <= 0x0187EC) return UCD_PROPERTY_IDEOGRAPHIC;
if (c >= 0x018800 && c <= 0x018AF2) return UCD_PROPERTY_IDEOGRAPHIC;
if (c >= 0x01B170 && c <= 0x01B2FB) return UCD_PROPERTY_IDEOGRAPHIC;
break;
case 0x020000:
if (c >= 0x020000 && c <= 0x02A6D6) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x02A700 && c <= 0x02B734) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x02B740 && c <= 0x02B81D) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x02B820 && c <= 0x02CEA1) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x02CEB0 && c <= 0x02EBE0) return UCD_PROPERTY_IDEOGRAPHIC | UCD_PROPERTY_UNIFIED_IDEOGRAPH;
if (c >= 0x02F800 && c <= 0x02FA1D) return UCD_PROPERTY_IDEOGRAPHIC;
break;
}
@@ -434,8 +433,8 @@ static ucd_property properties_Lu(codepoint_t c)
if (c >= 0x0041 && c <= 0x0046) return UCD_PROPERTY_HEX_DIGIT | UCD_PROPERTY_ASCII_HEX_DIGIT;
break;
case 0x0300:
if (c >= 0x03D0 && c <= 0x03D2) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x03D0 && c <= 0x03D2) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0xFF00:
if (c >= 0xFF21 && c <= 0xFF26) return UCD_PROPERTY_HEX_DIGIT;
@@ -443,49 +442,49 @@ static ucd_property properties_Lu(codepoint_t c)
case 0x2100:
if (c == 0x2102) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x2107) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c == 0x2115) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x2119 && c <= 0x211D) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x2124) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x2128) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x212C && c <= 0x212D) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x2133 && c <= 0x2134) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x2145 && c <= 0x2149) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x2133 && c <= 0x2134) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x2145 && c <= 0x2149) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D400:
if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D456 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D456 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D49E && c <= 0x01D49F) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x01D4A2) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D4A5 && c <= 0x01D4A6) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D4A9 && c <= 0x01D4AC) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D4AE && c <= 0x01D4B9) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D4C5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D4AE && c <= 0x01D4B9) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D4C5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D500:
if (c <= 0x01D505) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c <= 0x01D505) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D507 && c <= 0x01D50A) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D50D && c <= 0x01D514) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D516 && c <= 0x01D51C) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D51E && c <= 0x01D539) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D51E && c <= 0x01D539) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D53B && c <= 0x01D53E) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D540 && c <= 0x01D544) return UCD_PROPERTY_OTHER_MATH;
if (c == 0x01D546) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D54A && c <= 0x01D550) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D552) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D552) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D600:
if (c <= 0x01D6A5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c <= 0x01D6A5) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D6A8 && c <= 0x01D6C0) return UCD_PROPERTY_OTHER_MATH;
if (c >= 0x01D6DC && c <= 0x01D6FA) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D6DC && c <= 0x01D6FA) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
case 0x01D700:
if (c >= 0x01D716 && c <= 0x01D734) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D750 && c <= 0x01D76E) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D78A && c <= 0x01D7A8) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D7C4 && c <= 0x01D7CB) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu
if (c >= 0x01D716 && c <= 0x01D734) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D750 && c <= 0x01D76E) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D78A && c <= 0x01D7A8) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
if (c >= 0x01D7C4 && c <= 0x01D7CB) return UCD_PROPERTY_OTHER_MATH; /* Ll|Lu */
break;
}
return 0;
@@ -613,6 +612,7 @@ static ucd_property properties_Mc(codepoint_t c)
if (c >= 0x1C34 && c <= 0x1C35) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x1CE1) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CF2 && c <= 0x1CF3) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x1CF7) return UCD_PROPERTY_DIACRITIC;
break;
case 0x3000:
if (c >= 0x302E && c <= 0x302F) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_OTHER_GRAPHEME_EXTEND;
@@ -709,6 +709,12 @@ static ucd_property properties_Mc(codepoint_t c)
if (c >= 0x011720 && c <= 0x011721) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011726) return UCD_PROPERTY_OTHER_ALPHABETIC;
break;
case 0x011A00:
if (c >= 0x011A07 && c <= 0x011A08) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011A39) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011A57 && c <= 0x011A58) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011A97) return UCD_PROPERTY_OTHER_ALPHABETIC;
break;
case 0x011C00:
if (c == 0x011C2F) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011C3E) return UCD_PROPERTY_OTHER_ALPHABETIC;
@@ -818,6 +824,8 @@ static ucd_property properties_Mn(codepoint_t c)
if (c >= 0x0AC7 && c <= 0x0AC8) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x0ACD) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x0AE2 && c <= 0x0AE3) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x0AFA && c <= 0x0AFC) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x0AFD && c <= 0x0AFF) return UCD_PROPERTY_DIACRITIC;
break;
case 0x0B00:
if (c == 0x0B01) return UCD_PROPERTY_OTHER_ALPHABETIC;
@@ -848,7 +856,8 @@ static ucd_property properties_Mn(codepoint_t c)
if (c >= 0x0CE2 && c <= 0x0CE3) return UCD_PROPERTY_OTHER_ALPHABETIC;
break;
case 0x0D00:
if (c == 0x0D01) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x0D00 && c <= 0x0D01) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x0D3B && c <= 0x0D3C) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x0D41 && c <= 0x0D44) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x0D4D) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x0D62 && c <= 0x0D63) return UCD_PROPERTY_OTHER_ALPHABETIC;
@@ -964,7 +973,7 @@ static ucd_property properties_Mn(codepoint_t c)
case 0x1D00:
if (c >= 0x1DC4 && c <= 0x1DCF) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1DE7 && c <= 0x1DF4) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x1DF5) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1DF5 && c <= 0x1DF9) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1DFD && c <= 0x1DFF) return UCD_PROPERTY_DIACRITIC;
break;
case 0x2000:
@@ -1111,6 +1120,16 @@ static ucd_property properties_Mn(codepoint_t c)
if (c >= 0x011727 && c <= 0x01172A) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x01172B) return UCD_PROPERTY_DIACRITIC;
break;
case 0x011A00:
if (c >= 0x011A01 && c <= 0x011A0A) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011A34) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x011A35 && c <= 0x011A3E) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011A47) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x011A51 && c <= 0x011A5B) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011A8A && c <= 0x011A96) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011A98) return UCD_PROPERTY_EXTENDER;
if (c == 0x011A99) return UCD_PROPERTY_DIACRITIC;
break;
case 0x011C00:
if (c >= 0x011C30 && c <= 0x011C36) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011C38 && c <= 0x011C3D) return UCD_PROPERTY_OTHER_ALPHABETIC;
@@ -1120,6 +1139,16 @@ static ucd_property properties_Mn(codepoint_t c)
if (c >= 0x011CB2 && c <= 0x011CB3) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011CB5 && c <= 0x011CB6) return UCD_PROPERTY_OTHER_ALPHABETIC;
break;
case 0x011D00:
if (c >= 0x011D31 && c <= 0x011D36) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011D3A) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011D3C && c <= 0x011D3D) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011D3F && c <= 0x011D41) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c == 0x011D42) return UCD_PROPERTY_DIACRITIC;
if (c == 0x011D43) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x011D44 && c <= 0x011D45) return UCD_PROPERTY_DIACRITIC;
if (c == 0x011D47) return UCD_PROPERTY_OTHER_ALPHABETIC;
break;
case 0x016A00:
if (c >= 0x016AF0 && c <= 0x016AF4) return UCD_PROPERTY_DIACRITIC;
break;
@@ -1165,7 +1194,7 @@ static ucd_property properties_Nd(codepoint_t c)
switch (c & 0xFFFFFF00)
{
case 0x0000:
if (c >= 0x0030 && c <= 0x0039) return UCD_PROPERTY_HEX_DIGIT | UCD_PROPERTY_ASCII_HEX_DIGIT | UCD_PROPERTY_EMOJI;
if (c >= 0x0030 && c <= 0x0039) return UCD_PROPERTY_HEX_DIGIT | UCD_PROPERTY_ASCII_HEX_DIGIT | UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_COMPONENT;
break;
case 0xFF00:
if (c >= 0xFF10 && c <= 0xFF19) return UCD_PROPERTY_HEX_DIGIT;
@@ -1279,10 +1308,10 @@ static ucd_property properties_Pe(codepoint_t c)
break;
case 0x2700:
if (c == 0x27C6) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x27E6 && c <= 0x27EF) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; // Pe|Ps
if (c >= 0x27E6 && c <= 0x27EF) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; /* Pe|Ps */
return UCD_PROPERTY_PATTERN_SYNTAX;
case 0x2900:
return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; // Pe|Ps
return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; /* Pe|Ps */
case 0x2E00:
return UCD_PROPERTY_PATTERN_SYNTAX;
case 0x3000:
@@ -1337,9 +1366,9 @@ static ucd_property properties_Po(codepoint_t c)
case 0x0000:
if (c == 0x0021) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_EXCLAMATION_MARK;
if (c == 0x0022) return UCD_PROPERTY_QUOTATION_MARK | UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x0023) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x0023) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_PATTERN_SYNTAX | UCD_PROPERTY_EMOJI_COMPONENT;
if (c == 0x0027) return UCD_PROPERTY_QUOTATION_MARK | UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x002A) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x002A) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_PATTERN_SYNTAX | UCD_PROPERTY_EMOJI_COMPONENT;
if (c == 0x002C) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COMMA;
if (c == 0x002E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x003A) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COLON;
@@ -1609,6 +1638,11 @@ static ucd_property properties_Po(codepoint_t c)
case 0x11700:
if (c >= 0x01173C && c <= 0x01173E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL;
break;
case 0x11A00:
if (c >= 0x011A42 && c <= 0x011A43) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL;
if (c >= 0x011A9B && c <= 0x011A9C) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL;
if (c >= 0x011AA1 && c <= 0x011AA2) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11C00:
if (c >= 0x011C41 && c <= 0x011C42) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL;
if (c == 0x011C43) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
@@ -1664,7 +1698,7 @@ static ucd_property properties_Ps(codepoint_t c)
break;
case 0x2700:
if (c == 0x27C5) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x27E6 && c <= 0x27EF) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; // Pe|Ps
if (c >= 0x27E6 && c <= 0x27EF) return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX; /* Pe|Ps */
return UCD_PROPERTY_PATTERN_SYNTAX;
case 0x2900:
return UCD_PROPERTY_OTHER_MATH | UCD_PROPERTY_PATTERN_SYNTAX;
@@ -1746,7 +1780,7 @@ static ucd_property properties_Sk(codepoint_t c)
if (c == 0xFFE3) return UCD_PROPERTY_DIACRITIC;
break;
case 0x01F300:
return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER;
return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER | UCD_PROPERTY_EMOJI_COMPONENT;
}
return 0;
}
@@ -1839,7 +1873,7 @@ static ucd_property properties_So(codepoint_t c)
if (c == 0x23F3) return UCD_PROPERTY_PATTERN_SYNTAX | UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x23E9 && c <= 0x23F3) return UCD_PROPERTY_PATTERN_SYNTAX | UCD_PROPERTY_EMOJI;
if (c >= 0x23F8 && c <= 0x23FA) return UCD_PROPERTY_PATTERN_SYNTAX | UCD_PROPERTY_EMOJI;
if (c >= 0x23E3 && c <= 0x23FE) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c >= 0x23E3) return UCD_PROPERTY_PATTERN_SYNTAX;
break;
case 0x2400:
if (c >= 0x2400 && c <= 0x244A) return UCD_PROPERTY_PATTERN_SYNTAX;
@@ -1979,7 +2013,7 @@ static ucd_property properties_So(codepoint_t c)
if (c >= 0x01F170 && c <= 0x01F189) return UCD_PROPERTY_OTHER_ALPHABETIC | UCD_PROPERTY_OTHER_UPPERCASE;
if (c == 0x01F18E) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F191 && c <= 0x01F19A) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F1E6) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F1E6) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_REGIONAL_INDICATOR | UCD_PROPERTY_EMOJI_COMPONENT;
break;
case 0x01F200:
if (c == 0x01F201) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
@@ -2074,23 +2108,29 @@ static ucd_property properties_So(codepoint_t c)
if (c >= 0x01F6EB && c <= 0x01F6EC) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c == 0x01F6F0) return UCD_PROPERTY_EMOJI;
if (c == 0x01F6F3) return UCD_PROPERTY_EMOJI;
if (c >= 0x01F6F4 && c <= 0x01F6F6) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F6F4 && c <= 0x01F6F8) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
break;
case 0x01F900:
if (c <= 0x01F90B) return 0;
if (c >= 0x01F918 && c <= 0x01F91C) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c >= 0x01F910 && c <= 0x01F91D) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c == 0x01F91E) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c >= 0x01F91E && c <= 0x01F91F) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c == 0x01F926) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c >= 0x01F920 && c <= 0x01F927) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c == 0x01F930) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c >= 0x01F920 && c <= 0x01F92F) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F930 && c <= 0x01F932) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c == 0x01F93B) return 0;
if (c >= 0x01F93A && c <= 0x01F93C) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F933 && c <= 0x01F93E) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c == 0x01F946) return 0;
if (c >= 0x01F940 && c <= 0x01F94B) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c == 0x01F94C) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F950 && c <= 0x01F95E) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F95F && c <= 0x01F96B) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F980 && c <= 0x01F991) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F992 && c <= 0x01F997) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c == 0x01F9C0) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
if (c >= 0x01F9D1 && c <= 0x01F9DD) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION | UCD_PROPERTY_EMOJI_MODIFIER_BASE;
if (c >= 0x01F9D0 && c <= 0x01F9E6) return UCD_PROPERTY_EMOJI | UCD_PROPERTY_EMOJI_PRESENTATION;
return UCD_PROPERTY_EMOJI;
}
return 0;
@@ -2132,6 +2172,6 @@ ucd_property ucd_properties(codepoint_t c, ucd_category category)
case UCD_CATEGORY_Zl: return UCD_PROPERTY_WHITE_SPACE | UCD_PROPERTY_PATTERN_WHITE_SPACE;
case UCD_CATEGORY_Zp: return UCD_PROPERTY_WHITE_SPACE | UCD_PROPERTY_PATTERN_WHITE_SPACE | ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR;
case UCD_CATEGORY_Zs: return properties_Zs(c);
default: return 0; // Co Cs Ii Lt Me
default: return 0; /* Co Cs Ii Lt Me */
};
}

+ 765
- 701
src/ucd-tools/src/scripts.c
File diff suppressed because it is too large
View File


+ 3
- 0
src/ucd-tools/src/tostring.c View File

@@ -120,6 +120,7 @@ const char *ucd_get_script_string(ucd_script s)
"Geok",
"Geor",
"Glag",
"Gonm",
"Goth",
"Gran",
"Grek",
@@ -217,6 +218,7 @@ const char *ucd_get_script_string(ucd_script s)
"Sind",
"Sinh",
"Sora",
"Soyo",
"Sund",
"Sylo",
"Syrc",
@@ -246,6 +248,7 @@ const char *ucd_get_script_string(ucd_script s)
"Xpeo",
"Xsux",
"Yiii",
"Zanb",
"Zinh",
"Zmth",
"Zsym",

+ 36
- 26
src/ucd-tools/tests/printcdata.c View File

@@ -17,6 +17,7 @@
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

#include "config.h"
#include "ucd/ucd.h"

#include <locale.h>
@@ -25,6 +26,13 @@
#include <wchar.h>
#include <wctype.h>

#ifndef HAVE_ISWBLANK
/* Fallback iswblank, compiled only when HAVE_ISWBLANK is not defined
 * (presumably because the build configuration found no iswblank in the
 * C library -- confirm against the configure checks).
 *
 * Approximates "blank" as whitespace that does not end a line: every
 * character accepted by iswspace, minus the line and paragraph breaks.
 *
 * Returns 1 if c is classified as blank, 0 otherwise.
 */
static int iswblank(wint_t c)
{
	if (!iswspace(c))
		return 0;
	/* U+000A..U+000D: LF, VT, FF and CR are whitespace but not blank. */
	if (c >= 0x0A && c <= 0x0D)
		return 0;
	/* U+0085 (NEL), U+2028 (LINE SEPARATOR) and U+2029 (PARAGRAPH SEPARATOR)
	 * are line breaks as well; where iswspace accepts them they must not be
	 * reported as blank.
	 */
	if (c == 0x85 || c == 0x2028 || c == 0x2029)
		return 0;
	return 1;
}
#endif

void fput_utf8c(FILE *out, codepoint_t c)
{
if (c < 0x80)
@@ -86,7 +94,7 @@ void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
{
switch (mode)
{
case 'c': // character
case 'c': /* character */
switch (c)
{
case '\t': fputs("\\t", out); break;
@@ -95,10 +103,10 @@ void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
default: fput_utf8c(out, c); break;
}
break;
case 'h': // hexadecimal (lower)
case 'h': /* hexadecimal (lower) */
fprintf(out, "%06x", c);
break;
case 'H': // hexadecimal (upper)
case 'H': /* hexadecimal (upper) */
fprintf(out, "%06X", c);
break;
}
@@ -108,40 +116,40 @@ void uprintf_is(FILE *out, codepoint_t c, char mode)
{
switch (mode)
{
case 'A': // alpha-numeric
case 'A': /* alpha-numeric */
fputc(iswalnum(c) ? '1' : '0', out);
break;
case 'a': // alpha
case 'a': /* alpha */
fputc(iswalpha(c) ? '1' : '0', out);
break;
case 'b': // blank
case 'b': /* blank */
fputc(iswblank(c) ? '1' : '0', out);
break;
case 'c': // control
case 'c': /* control */
fputc(iswcntrl(c) ? '1' : '0', out);
break;
case 'd': // numeric
case 'd': /* numeric */
fputc(iswdigit(c) ? '1' : '0', out);
break;
case 'g': // glyph
case 'g': /* glyph */
fputc(iswgraph(c) ? '1' : '0', out);
break;
case 'l': // lower case
case 'l': /* lower case */
fputc(iswlower(c) ? '1' : '0', out);
break;
case 'P': // printable
case 'P': /* printable */
fputc(iswprint(c) ? '1' : '0', out);
break;
case 'p': // punctuation
case 'p': /* punctuation */
fputc(iswpunct(c) ? '1' : '0', out);
break;
case 's': // whitespace
case 's': /* whitespace */
fputc(iswspace(c) ? '1' : '0', out);
break;
case 'u': // upper case
case 'u': /* upper case */
fputc(iswupper(c) ? '1' : '0', out);
break;
case 'x': // xdigit
case 'x': /* xdigit */
fputc(iswxdigit(c) ? '1' : '0', out);
break;
}
@@ -154,31 +162,31 @@ void uprintf(FILE *out, codepoint_t c, const char *format)
case '%':
switch (*++format)
{
case 'c': // category
case 'c': /* category */
fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
break;
case 'C': // category group
case 'C': /* category group */
fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
break;
case 'p': // codepoint
case 'p': /* codepoint */
uprintf_codepoint(out, c, *++format);
break;
case 'P': // properties
case 'P': /* properties */
fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c)));
break;
case 'i': // is*
case 'i': /* is* */
uprintf_is(out, c, *++format);
break;
case 'L': // lowercase
case 'L': /* lowercase */
uprintf_codepoint(out, towlower(c), *++format);
break;
case 's': // script
case 's': /* script */
fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
break;
case 'T': // titlecase
case 'T': /* titlecase */
uprintf_codepoint(out, ucd_totitle(c), *++format);
break;
case 'U': // uppercase
case 'U': /* uppercase */
uprintf_codepoint(out, towupper(c), *++format);
break;
}
@@ -224,7 +232,8 @@ int main(int argc, char **argv)
{
FILE *in = NULL;
const char *format = NULL;
for (int argn = 1; argn != argc; ++argn)
int argn;
for (argn = 1; argn != argc; ++argn)
{
const char *arg = argv[argn];
if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
@@ -250,7 +259,8 @@ int main(int argc, char **argv)
}
else
{
for (codepoint_t c = 0; c <= 0x10FFFF; ++c)
codepoint_t c;
for (c = 0; c <= 0x10FFFF; ++c)
uprintf(stdout, c, format ? format :
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n");
}

+ 28
- 26
src/ucd-tools/tests/printucddata.c View File

@@ -83,7 +83,7 @@ void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
{
switch (mode)
{
case 'c': // character
case 'c': /* character */
switch (c)
{
case '\t': fputs("\\t", out); break;
@@ -92,10 +92,10 @@ void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
default: fput_utf8c(out, c); break;
}
break;
case 'h': // hexadecimal (lower)
case 'h': /* hexadecimal (lower) */
fprintf(out, "%06x", c);
break;
case 'H': // hexadecimal (upper)
case 'H': /* hexadecimal (upper) */
fprintf(out, "%06X", c);
break;
}
@@ -105,40 +105,40 @@ void uprintf_is(FILE *out, codepoint_t c, char mode)
{
switch (mode)
{
case 'A': // alpha-numeric
case 'A': /* alpha-numeric */
fputc(ucd_isalnum(c) ? '1' : '0', out);
break;
case 'a': // alpha
case 'a': /* alpha */
fputc(ucd_isalpha(c) ? '1' : '0', out);
break;
case 'b': // blank
case 'b': /* blank */
fputc(ucd_isblank(c) ? '1' : '0', out);
break;
case 'c': // control
case 'c': /* control */
fputc(ucd_iscntrl(c) ? '1' : '0', out);
break;
case 'd': // numeric
case 'd': /* numeric */
fputc(ucd_isdigit(c) ? '1' : '0', out);
break;
case 'g': // glyph
case 'g': /* glyph */
fputc(ucd_isgraph(c) ? '1' : '0', out);
break;
case 'l': // lower case
case 'l': /* lower case */
fputc(ucd_islower(c) ? '1' : '0', out);
break;
case 'P': // printable
case 'P': /* printable */
fputc(ucd_isprint(c) ? '1' : '0', out);
break;
case 'p': // punctuation
case 'p': /* punctuation */
fputc(ucd_ispunct(c) ? '1' : '0', out);
break;
case 's': // whitespace
case 's': /* whitespace */
fputc(ucd_isspace(c) ? '1' : '0', out);
break;
case 'u': // upper case
case 'u': /* upper case */
fputc(ucd_isupper(c) ? '1' : '0', out);
break;
case 'x': // xdigit
case 'x': /* xdigit */
fputc(ucd_isxdigit(c) ? '1' : '0', out);
break;
}
@@ -151,31 +151,31 @@ void uprintf(FILE *out, codepoint_t c, const char *format)
case '%':
switch (*++format)
{
case 'c': // category
case 'c': /* category */
fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
break;
case 'C': // category group
case 'C': /* category group */
fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
break;
case 'p': // codepoint
case 'p': /* codepoint */
uprintf_codepoint(out, c, *++format);
break;
case 'P': // properties
case 'P': /* properties */
fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c)));
break;
case 'i': // is*
case 'i': /* is* */
uprintf_is(out, c, *++format);
break;
case 'L': // lowercase
case 'L': /* lowercase */
uprintf_codepoint(out, ucd_tolower(c), *++format);
break;
case 's': // script
case 's': /* script */
fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
break;
case 'T': // titlecase
case 'T': /* titlecase */
uprintf_codepoint(out, ucd_totitle(c), *++format);
break;
case 'U': // uppercase
case 'U': /* uppercase */
uprintf_codepoint(out, ucd_toupper(c), *++format);
break;
}
@@ -221,7 +221,8 @@ int main(int argc, char **argv)
{
FILE *in = NULL;
const char *format = NULL;
for (int argn = 1; argn != argc; ++argn)
int argn;
for (argn = 1; argn != argc; ++argn)
{
const char *arg = argv[argn];
if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
@@ -245,7 +246,8 @@ int main(int argc, char **argv)
}
else
{
for (codepoint_t c = 0; c <= 0x10FFFF; ++c)
codepoint_t c;
for (c = 0; c <= 0x10FFFF; ++c)
uprintf(stdout, c, format ? format :
"%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n");
}

+ 4
- 3
src/ucd-tools/tools/case.py View File

@@ -51,14 +51,15 @@ if __name__ == '__main__':
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

// NOTE: This file is automatically generated from the UnicodeData.txt file in
// the Unicode Character database by the ucd-tools/tools/categories.py script.
/* NOTE: This file is automatically generated from the UnicodeData.txt file in
* the Unicode Character database by the ucd-tools/tools/categories.py script.
*/

#include "ucd/ucd.h"

#include <stddef.h>

// Unicode Character Data %s
/* Unicode Character Data %s */

struct case_conversion_entry
{

+ 8
- 7
src/ucd-tools/tools/categories.py View File

@@ -110,8 +110,9 @@ if __name__ == '__main__':
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

// NOTE: This file is automatically generated from the UnicodeData.txt file in
// the Unicode Character database by the ucd-tools/tools/categories.py script.
/* NOTE: This file is automatically generated from the UnicodeData.txt file in
* the Unicode Character database by the ucd-tools/tools/categories.py script.
*/

#include "ucd/ucd.h"

@@ -149,7 +150,7 @@ if __name__ == '__main__':
#define Zs UCD_CATEGORY_Zs
#define Ii UCD_CATEGORY_Ii

// Unicode Character Data %s
/* Unicode Character Data %s */
""" % ucd_version)

for category in special_categories:
@@ -187,7 +188,7 @@ if __name__ == '__main__':
sys.stdout.write('{\n')
for codepoint, table in sorted(category_tables[table_index].items()):
if isinstance(table, str):
sys.stdout.write('\tcategories_%s, // %s\n' % (table, codepoint))
sys.stdout.write('\tcategories_%s, /* %s */\n' % (table, codepoint))
else:
sys.stdout.write('\tcategories_%s,\n' % codepoint)
sys.stdout.write('};\n')
@@ -197,14 +198,14 @@ if __name__ == '__main__':
sys.stdout.write('{\n')
for codepoints, category, comment in category_sets:
if category:
sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
sys.stdout.write('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, category, codepoints, comment))
else:
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
sys.stdout.write('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
sys.stdout.write('\t{\n')
sys.stdout.write('\t\tconst uint8_t *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
sys.stdout.write('\t\treturn (ucd_category)table[c % 256];\n')
sys.stdout.write('\t}\n')
sys.stdout.write('\treturn Ii; // Invalid Unicode Codepoint\n')
sys.stdout.write('\treturn Ii; /* Invalid Unicode Codepoint */\n')
sys.stdout.write('}\n')

sys.stdout.write("""

+ 2
- 0
src/ucd-tools/tools/printdata.py View File

@@ -166,6 +166,8 @@ def properties(data):
props += (2 ** 34) * data.get('Emoji_Presentation', 0) # emoji-data
props += (2 ** 35) * data.get('Emoji_Modifier', 0) # emoji-data
props += (2 ** 36) * data.get('Emoji_Modifier_Base', 0) # emoji-data
props += (2 ** 37) * data.get('Regional_Indicator', 0) # PropList 10.0.0
props += (2 ** 38) * data.get('Emoji_Component', 0) # emoji-data 5.0
# eSpeak NG extended properties:
props += (2 ** 52) * data.get('Inverted_Terminal_Punctuation', 0)
props += (2 ** 53) * data.get('Punctuation_In_Word', 0)

+ 11
- 7
src/ucd-tools/tools/scripts.py View File

@@ -104,8 +104,9 @@ if __name__ == '__main__':
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

// NOTE: This file is automatically generated from the Scripts.txt file in
// the Unicode Character database by the ucd-tools/tools/scripts.py script.
/* NOTE: This file is automatically generated from the Scripts.txt file in
* the Unicode Character database by the ucd-tools/tools/scripts.py script.
*/

#include "ucd/ucd.h"

@@ -152,6 +153,7 @@ if __name__ == '__main__':
#define Geok UCD_SCRIPT_Geok
#define Geor UCD_SCRIPT_Geor
#define Glag UCD_SCRIPT_Glag
#define Gonm UCD_SCRIPT_Gonm
#define Goth UCD_SCRIPT_Goth
#define Gran UCD_SCRIPT_Gran
#define Grek UCD_SCRIPT_Grek
@@ -249,6 +251,7 @@ if __name__ == '__main__':
#define Sind UCD_SCRIPT_Sind
#define Sinh UCD_SCRIPT_Sinh
#define Sora UCD_SCRIPT_Sora
#define Soyo UCD_SCRIPT_Soyo
#define Sund UCD_SCRIPT_Sund
#define Sylo UCD_SCRIPT_Sylo
#define Syrc UCD_SCRIPT_Syrc
@@ -278,6 +281,7 @@ if __name__ == '__main__':
#define Xpeo UCD_SCRIPT_Xpeo
#define Xsux UCD_SCRIPT_Xsux
#define Yiii UCD_SCRIPT_Yiii
#define Zanb UCD_SCRIPT_Zanb
#define Zinh UCD_SCRIPT_Zinh
#define Zmth UCD_SCRIPT_Zmth
#define Zsym UCD_SCRIPT_Zsym
@@ -285,7 +289,7 @@ if __name__ == '__main__':
#define Zyyy UCD_SCRIPT_Zyyy
#define Zzzz UCD_SCRIPT_Zzzz

// Unicode Character Data %s
/* Unicode Character Data %s */
""" % ucd_version)

for script in special_scripts:
@@ -323,7 +327,7 @@ if __name__ == '__main__':
sys.stdout.write('{\n')
for codepoint, table in sorted(script_tables[table_index].items()):
if isinstance(table, str):
sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
sys.stdout.write('\tscripts_%s, /* %s */\n' % (table, codepoint))
else:
sys.stdout.write('\tscripts_%s,\n' % codepoint)
sys.stdout.write('};\n')
@@ -333,12 +337,12 @@ if __name__ == '__main__':
sys.stdout.write('{\n')
for codepoints, script, comment in script_sets:
if script:
sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
sys.stdout.write('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, script, codepoints, comment))
else:
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
sys.stdout.write('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
sys.stdout.write('\t{\n')
sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
sys.stdout.write('\t\treturn (ucd_script)table[c % 256];\n')
sys.stdout.write('\t}\n')
sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
sys.stdout.write('\treturn Zzzz; /* Invalid Unicode Codepoint */\n')
sys.stdout.write('}\n')

Loading…
Cancel
Save