/** @brief A 64-bit set of UCD_PROPERTY_* flags combined with bitwise OR. */
typedef uint64_t ucd_property;
/** @brief Flags naming individual Unicode character properties.
  *
  * Every enumerator occupies its own bit, so several properties can be
  * combined into one value with bitwise OR.
  */
enum
{
	UCD_PROPERTY_WHITE_SPACE = 0x0000000000000001ull, /**< @brief White_Space */
	UCD_PROPERTY_BIDI_CONTROL = 0x0000000000000002ull, /**< @brief Bidi_Control */
	UCD_PROPERTY_JOIN_CONTROL = 0x0000000000000004ull, /**< @brief Join_Control */
	UCD_PROPERTY_DASH = 0x0000000000000008ull, /**< @brief Dash */
	UCD_PROPERTY_HYPHEN = 0x0000000000000010ull, /**< @brief Hyphen */
	UCD_PROPERTY_QUOTATION_MARK = 0x0000000000000020ull, /**< @brief Quotation_Mark */
	UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x0000000000000040ull, /**< @brief Terminal_Punctuation */
	UCD_PROPERTY_OTHER_MATH = 0x0000000000000080ull, /**< @brief Other_Math */
	UCD_PROPERTY_HEX_DIGIT = 0x0000000000000100ull, /**< @brief Hex_Digit */
	UCD_PROPERTY_ASCII_HEX_DIGIT = 0x0000000000000200ull, /**< @brief ASCII_Hex_Digit */
	UCD_PROPERTY_OTHER_ALPHABETIC = 0x0000000000000400ull, /**< @brief Other_Alphabetic */
	UCD_PROPERTY_IDEOGRAPHIC = 0x0000000000000800ull, /**< @brief Ideographic */
	UCD_PROPERTY_DIACRITIC = 0x0000000000001000ull, /**< @brief Diacritic */
	UCD_PROPERTY_EXTENDER = 0x0000000000002000ull, /**< @brief Extender */
	UCD_PROPERTY_OTHER_LOWERCASE = 0x0000000000004000ull, /**< @brief Other_Lowercase */
	UCD_PROPERTY_OTHER_UPPERCASE = 0x0000000000008000ull, /**< @brief Other_Uppercase */
	UCD_PROPERTY_NONCHARACTER_CODE_POINT = 0x0000000000010000ull, /**< @brief Noncharacter_Code_Point */
	UCD_PROPERTY_OTHER_GRAPHEME_EXTEND = 0x0000000000020000ull, /**< @brief Other_Grapheme_Extend */
	UCD_PROPERTY_IDS_BINARY_OPERATOR = 0x0000000000040000ull, /**< @brief IDS_Binary_Operator */
	UCD_PROPERTY_IDS_TRINARY_OPERATOR = 0x0000000000080000ull, /**< @brief IDS_Trinary_Operator */
	UCD_PROPERTY_RADICAL = 0x0000000000100000ull, /**< @brief Radical */
	UCD_PROPERTY_UNIFIED_IDEOGRAPH = 0x0000000000200000ull, /**< @brief Unified_Ideograph */
	UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT = 0x0000000000400000ull, /**< @brief Other_Default_Ignorable_Code_Point */
};
/** @brief Return the properties of the specified codepoint. | /** @brief Return the properties of the specified codepoint. | ||||
IDS_Trinary_Operator = UCD_PROPERTY_IDS_TRINARY_OPERATOR, /**< @brief IDS_Trinary_Operator */ | IDS_Trinary_Operator = UCD_PROPERTY_IDS_TRINARY_OPERATOR, /**< @brief IDS_Trinary_Operator */ | ||||
Radical = UCD_PROPERTY_RADICAL, /**< @brief Radical */ | Radical = UCD_PROPERTY_RADICAL, /**< @brief Radical */ | ||||
Unified_Ideograph = UCD_PROPERTY_UNIFIED_IDEOGRAPH, /**< @brief Unified_Ideograph */ | Unified_Ideograph = UCD_PROPERTY_UNIFIED_IDEOGRAPH, /**< @brief Unified_Ideograph */ | ||||
Other_Default_Ignorable_Code_Point = UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT, /**< @brief Other_Default_Ignorable_Code_Point */ | |||||
}; | }; | ||||
/** @brief Return the properties of the specified codepoint. | /** @brief Return the properties of the specified codepoint. |
/** @brief Look up the property flag of a codepoint covered by the "Cn" table.
  *
  * NOTE(review): "Cn" presumably refers to the unassigned general category;
  * confirm against the caller that dispatches to this function.
  *
  * @param c The codepoint to classify.
  *
  * @return A single UCD_PROPERTY_* flag when @c c falls into one of the ranges
  *         below, otherwise 0.
  */
static int properties_Cn(codepoint_t c)
{
	// Dispatch on the upper 16 bits so only the ranges of one 64K block are tested.
	switch (c & 0xFFFF0000)
	{
	case 0x0000:
		if (c == 0x2065) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		if (c >= 0xFFF0 && c <= 0xFFF8) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		if (c >= 0xFDD0 && c <= 0xFDEF) return UCD_PROPERTY_NONCHARACTER_CODE_POINT;
		break;
	case 0x0E0000:
		if (c == 0xE0000) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		if (c >= 0xE0002 && c <= 0xE001F) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		if (c >= 0xE0080 && c <= 0xE00FF) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		if (c >= 0xE01F0 && c <= 0xE0FFF) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
		break;
	default:
		break;
	}
	// Codepoints whose low 16 bits are 0xFFFE or 0xFFFF, whatever the upper bits are.
	if ((c & 0x0000FFFF) >= 0xFFFE) return UCD_PROPERTY_NONCHARACTER_CODE_POINT;
	return 0;
}
if (c >= 0x03F0 && c <= 0x03F1) return UCD_PROPERTY_OTHER_MATH; | if (c >= 0x03F0 && c <= 0x03F1) return UCD_PROPERTY_OTHER_MATH; | ||||
if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x03F4 && c <= 0x03F5) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
break; | break; | ||||
case 0xFF00: | |||||
if (c >= 0xFF41 && c <= 0xFF46) return UCD_PROPERTY_HEX_DIGIT; | |||||
break; | |||||
case 0x2100: | case 0x2100: | ||||
if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x210A && c <= 0x2113) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x212F && c <= 0x2131) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x213C && c <= 0x213F) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
if (c >= 0x2145 && c <= 0x2149) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x2145 && c <= 0x2149) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
break; | break; | ||||
case 0xFF00: | |||||
if (c >= 0xFF41 && c <= 0xFF46) return UCD_PROPERTY_HEX_DIGIT; | |||||
break; | |||||
case 0x01D400: | case 0x01D400: | ||||
if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x01D400 && c <= 0x01D454) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
if (c >= 0x01D456 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | if (c >= 0x01D456 && c <= 0x01D49C) return UCD_PROPERTY_OTHER_MATH; // Ll|Lu | ||||
if (c == 0x00AA) return UCD_PROPERTY_OTHER_LOWERCASE; | if (c == 0x00AA) return UCD_PROPERTY_OTHER_LOWERCASE; | ||||
if (c == 0x00BA) return UCD_PROPERTY_OTHER_LOWERCASE; | if (c == 0x00BA) return UCD_PROPERTY_OTHER_LOWERCASE; | ||||
break; | break; | ||||
case 0x1100: | |||||
if (c >= 0x115F && c <= 0x1160) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; | |||||
break; | |||||
case 0x2100: | case 0x2100: | ||||
if (c >= 0x2135 && c <= 0x2138) return UCD_PROPERTY_OTHER_MATH; | if (c >= 0x2135 && c <= 0x2138) return UCD_PROPERTY_OTHER_MATH; | ||||
break; | break; | ||||
case 0x3000: | case 0x3000: | ||||
if (c == 0x3006) return UCD_PROPERTY_IDEOGRAPHIC; | if (c == 0x3006) return UCD_PROPERTY_IDEOGRAPHIC; | ||||
break; | break; | ||||
case 0x3100: | |||||
if (c == 0x3164) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; | |||||
break; | |||||
case 0xAA00: | case 0xAA00: | ||||
if (c == 0xAAC0) return UCD_PROPERTY_DIACRITIC; | if (c == 0xAAC0) return UCD_PROPERTY_DIACRITIC; | ||||
if (c == 0xAAC2) return UCD_PROPERTY_DIACRITIC; | if (c == 0xAAC2) return UCD_PROPERTY_DIACRITIC; | ||||
if (c >= 0xFA23 && c <= 0xFA24) return UCD_PROPERTY_UNIFIED_IDEOGRAPH; | if (c >= 0xFA23 && c <= 0xFA24) return UCD_PROPERTY_UNIFIED_IDEOGRAPH; | ||||
if (c >= 0xFA27 && c <= 0xFA29) return UCD_PROPERTY_UNIFIED_IDEOGRAPH; | if (c >= 0xFA27 && c <= 0xFA29) return UCD_PROPERTY_UNIFIED_IDEOGRAPH; | ||||
break; | break; | ||||
case 0xFF00: | |||||
if (c == 0xFFA0) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; | |||||
break; | |||||
case 0x11300: | case 0x11300: | ||||
if (c == 0x01135D) return UCD_PROPERTY_EXTENDER; | if (c == 0x01135D) return UCD_PROPERTY_EXTENDER; | ||||
break; | break; | ||||
if (c >= 0x0300 && c <= 0x0344) return UCD_PROPERTY_DIACRITIC; | if (c >= 0x0300 && c <= 0x0344) return UCD_PROPERTY_DIACRITIC; | ||||
if (c == 0x0345) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_OTHER_ALPHABETIC | UCD_PROPERTY_OTHER_LOWERCASE; | if (c == 0x0345) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_OTHER_ALPHABETIC | UCD_PROPERTY_OTHER_LOWERCASE; | ||||
if (c >= 0x0346 && c <= 0x034E) return UCD_PROPERTY_DIACRITIC; | if (c >= 0x0346 && c <= 0x034E) return UCD_PROPERTY_DIACRITIC; | ||||
if (c == 0x034F) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; | |||||
if (c >= 0x0350 && c <= 0x0357) return UCD_PROPERTY_DIACRITIC; | if (c >= 0x0350 && c <= 0x0357) return UCD_PROPERTY_DIACRITIC; | ||||
if (c >= 0x035D && c <= 0x0362) return UCD_PROPERTY_DIACRITIC; | if (c >= 0x035D && c <= 0x0362) return UCD_PROPERTY_DIACRITIC; | ||||
break; | break; | ||||
if (c >= 0x1732 && c <= 0x1733) return UCD_PROPERTY_OTHER_ALPHABETIC; | if (c >= 0x1732 && c <= 0x1733) return UCD_PROPERTY_OTHER_ALPHABETIC; | ||||
if (c >= 0x1752 && c <= 0x1753) return UCD_PROPERTY_OTHER_ALPHABETIC; | if (c >= 0x1752 && c <= 0x1753) return UCD_PROPERTY_OTHER_ALPHABETIC; | ||||
if (c >= 0x1772 && c <= 0x1773) return UCD_PROPERTY_OTHER_ALPHABETIC; | if (c >= 0x1772 && c <= 0x1773) return UCD_PROPERTY_OTHER_ALPHABETIC; | ||||
if (c >= 0x17B4 && c <= 0x17B5) return UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT; | |||||
if (c >= 0x17B7 && c <= 0x17BD) return UCD_PROPERTY_OTHER_ALPHABETIC; | if (c >= 0x17B7 && c <= 0x17BD) return UCD_PROPERTY_OTHER_ALPHABETIC; | ||||
if (c == 0x17C6) return UCD_PROPERTY_OTHER_ALPHABETIC; | if (c == 0x17C6) return UCD_PROPERTY_OTHER_ALPHABETIC; | ||||
if (c >= 0x17C9 && c <= 0x17D3) return UCD_PROPERTY_DIACRITIC; | if (c >= 0x17C9 && c <= 0x17D3) return UCD_PROPERTY_DIACRITIC; |
props += (2 ** 19) * data.get('IDS_Trinary_Operator', 0) | props += (2 ** 19) * data.get('IDS_Trinary_Operator', 0) | ||||
props += (2 ** 20) * data.get('Radical', 0) | props += (2 ** 20) * data.get('Radical', 0) | ||||
props += (2 ** 21) * data.get('Unified_Ideograph', 0) | props += (2 ** 21) * data.get('Unified_Ideograph', 0) | ||||
props += (2 ** 22) * data.get('Other_Default_Ignorable_Code_Point', 0) | |||||
return props | return props | ||||
if __name__ == '__main__': | if __name__ == '__main__': |