Browse Source

Add Noncharacter_Code_Point support from PropList.txt.

master
Reece H. Dunn 8 years ago
parent
commit
9dc44bf0d4
3 changed files with 48 additions and 37 deletions
  1. 20
    18
      src/include/ucd/ucd.h
  2. 9
    1
      src/proplist.c
  3. 19
    18
      tools/printdata.py

+ 20
- 18
src/include/ucd/ucd.h View File

@@ -329,23 +329,24 @@ ucd_script ucd_lookup_script(codepoint_t c);
*/
typedef enum ucd_property_
{
UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space PropList */
UCD_PROPERTY_NO_BREAK = 0x00000002, /**< @brief <noBreak> DispositionType (enabled check only) */
UCD_PROPERTY_BIDI_CONTROL = 0x00000004, /**< @brief Bidi_Control PropList */
UCD_PROPERTY_JOIN_CONTROL = 0x00000008, /**< @brief Join_Control PropList */
UCD_PROPERTY_DASH = 0x00000010, /**< @brief Dash PropList */
UCD_PROPERTY_HYPHEN = 0x00000020, /**< @brief Hyphen PropList */
UCD_PROPERTY_QUOTATION_MARK = 0x00000040, /**< @brief Quotation_Mark PropList */
UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000080, /**< @brief Terminal_Punctuation PropList */
UCD_PROPERTY_OTHER_MATH = 0x00000100, /**< @brief Other_Math PropList */
UCD_PROPERTY_HEX_DIGIT = 0x00000200, /**< @brief Hex_Digit PropList */
UCD_PROPERTY_ASCII_HEX_DIGIT = 0x00000400, /**< @brief ASCII_Hex_Digit PropList */
UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
UCD_PROPERTY_OTHER_LOWERCASE = 0x00008000, /**< @brief Other_Lowercase PropList */
UCD_PROPERTY_OTHER_UPPERCASE = 0x00010000, /**< @brief Other_Uppercase PropList */
UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space PropList */
UCD_PROPERTY_NO_BREAK = 0x00000002, /**< @brief <noBreak> DispositionType (enabled check) */
UCD_PROPERTY_BIDI_CONTROL = 0x00000004, /**< @brief Bidi_Control PropList */
UCD_PROPERTY_JOIN_CONTROL = 0x00000008, /**< @brief Join_Control PropList */
UCD_PROPERTY_DASH = 0x00000010, /**< @brief Dash PropList */
UCD_PROPERTY_HYPHEN = 0x00000020, /**< @brief Hyphen PropList */
UCD_PROPERTY_QUOTATION_MARK = 0x00000040, /**< @brief Quotation_Mark PropList */
UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000080, /**< @brief Terminal_Punctuation PropList */
UCD_PROPERTY_OTHER_MATH = 0x00000100, /**< @brief Other_Math PropList */
UCD_PROPERTY_HEX_DIGIT = 0x00000200, /**< @brief Hex_Digit PropList */
UCD_PROPERTY_ASCII_HEX_DIGIT = 0x00000400, /**< @brief ASCII_Hex_Digit PropList */
UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
UCD_PROPERTY_OTHER_LOWERCASE = 0x00008000, /**< @brief Other_Lowercase PropList */
UCD_PROPERTY_OTHER_UPPERCASE = 0x00010000, /**< @brief Other_Uppercase PropList */
UCD_PROPERTY_NONCHARACTER_CODE_POINT = 0x00020000, /**< @brief Noncharacter_Code_Point PropList */
} ucd_property;

/** @brief Return the properties of the specified codepoint.
@@ -806,7 +807,7 @@ namespace ucd
enum property
{
White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space PropList */
noBreak = UCD_PROPERTY_NO_BREAK, /**< @brief <noBreak> DispositionType (enabled check only) */
noBreak = UCD_PROPERTY_NO_BREAK, /**< @brief <noBreak> DispositionType (enabled check) */
Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control PropList */
Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control PropList */
Dash = UCD_PROPERTY_DASH, /**< @brief Dash PropList */
@@ -822,6 +823,7 @@ namespace ucd
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender PropList */
Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase PropList */
Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase PropList */
Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point PropList */
};

/** @brief Return the properties of the specified codepoint.

+ 9
- 1
src/proplist.c View File

@@ -48,6 +48,13 @@ static int properties_Cf(codepoint_t c)
return 0;
}

/** @brief Return the PropList properties of an unassigned (Cn) codepoint.
 *
 * The only property reported here is Noncharacter_Code_Point. It is set for
 * U+FDD0..U+FDEF and for every codepoint whose low 16 bits are 0xFFFE or
 * 0xFFFF, i.e. the last two codepoints of each 64K plane.
 *
 * @param c The codepoint to check.
 * @return  UCD_PROPERTY_NONCHARACTER_CODE_POINT if @a c is a noncharacter,
 *          otherwise 0.
 */
static int properties_Cn(codepoint_t c)
{
	/* Masking with 0xFFFF drops the plane number so one test covers all planes. */
	int is_noncharacter = (c >= 0xFDD0 && c <= 0xFDEF)
	                   || ((c & 0x0000FFFF) >= 0xFFFE);

	return is_noncharacter ? UCD_PROPERTY_NONCHARACTER_CODE_POINT : 0;
}

static int properties_Ll(codepoint_t c)
{
switch (c & 0xFFFFFF00)
@@ -1518,6 +1525,7 @@ ucd_property ucd_properties(codepoint_t c, ucd_category category)
{
case UCD_CATEGORY_Cc: return properties_Cc(c);
case UCD_CATEGORY_Cf: return properties_Cf(c);
case UCD_CATEGORY_Cn: return properties_Cn(c);
case UCD_CATEGORY_Ll: return properties_Ll(c);
case UCD_CATEGORY_Lm: return properties_Lm(c);
case UCD_CATEGORY_Lo: return properties_Lo(c) | properties_Lo_ideographic(c);
@@ -1539,6 +1547,6 @@ ucd_property ucd_properties(codepoint_t c, ucd_category category)
case UCD_CATEGORY_Zl: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zp: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zs: return properties_Zs(c);
default: return 0; // Cn Co Cs Ii Lt Me No Sc
default: return 0; // Co Cs Ii Lt Me No Sc
};
}

+ 19
- 18
tools/printdata.py View File

@@ -120,24 +120,25 @@ def decomposition_type(data, dtype):
return None

def properties(data):
props = 0
props += 1 * data.get('White_Space', 0)
props += 2 * (decomposition_type(data, '<noBreak>') != None)
props += 4 * data.get('Bidi_Control', 0)
props += 8 * data.get('Join_Control', 0)
props += 16 * data.get('Dash', 0)
props += 32 * data.get('Hyphen', 0)
props += 64 * data.get('Quotation_Mark', 0)
props += 128 * data.get('Terminal_Punctuation', 0)
props += 256 * data.get('Other_Math', 0)
props += 512 * data.get('Hex_Digit', 0)
props += 1024 * data.get('ASCII_Hex_Digit', 0)
props += 2048 * data.get('Other_Alphabetic', 0)
props += 4096 * data.get('Ideographic', 0)
props += 8192 * data.get('Diacritic', 0)
props += 16384 * data.get('Extender', 0)
props += 32768 * data.get('Other_Lowercase', 0)
props += 65536 * data.get('Other_Uppercase', 0)
props = 0
props += 1 * data.get('White_Space', 0)
props += 2 * (decomposition_type(data, '<noBreak>') != None)
props += 4 * data.get('Bidi_Control', 0)
props += 8 * data.get('Join_Control', 0)
props += 16 * data.get('Dash', 0)
props += 32 * data.get('Hyphen', 0)
props += 64 * data.get('Quotation_Mark', 0)
props += 128 * data.get('Terminal_Punctuation', 0)
props += 256 * data.get('Other_Math', 0)
props += 512 * data.get('Hex_Digit', 0)
props += 1024 * data.get('ASCII_Hex_Digit', 0)
props += 2048 * data.get('Other_Alphabetic', 0)
props += 4096 * data.get('Ideographic', 0)
props += 8192 * data.get('Diacritic', 0)
props += 16384 * data.get('Extender', 0)
props += 32768 * data.get('Other_Lowercase', 0)
props += 65536 * data.get('Other_Uppercase', 0)
props += 131072 * data.get('Noncharacter_Code_Point', 0)
return props

if __name__ == '__main__':

Loading…
Cancel
Save