Browse Source

Remove noBreak from ucd_properties -- it is a DispositionType, not a PropList type.

master
Reece H. Dunn 8 years ago
parent
commit
6553494c5c
4 changed files with 82 additions and 73 deletions
  1. 28
    2
      src/ctype.c
  2. 34
    36
      src/include/ucd/ucd.h
  3. 2
    16
      src/proplist.c
  4. 18
    19
      tools/printdata.py

+ 28
- 2
src/ctype.c View File

@@ -437,8 +437,34 @@ int ucd_ispunct(codepoint_t c)

/* Report whether a codepoint is whitespace that is not a no-break character.
 *
 * Takes the codepoint `c` and returns 1 when it is accepted, 0 otherwise.
 *
 * NOTE(review): this text looks like a diff hunk shown without its +/-
 * markers (the hunk header above reports 8 old lines and 34 new lines). The
 * first two statements appear to be the removed implementation and the switch
 * after them its replacement -- confirm against the real src/ctype.c. Read
 * literally, the first return makes the switch unreachable.
 */
int ucd_isspace(codepoint_t c)
{
// Property-mask form: accept when the White_Space flag is set and the
// NO_BREAK flag is clear (the masked value equals WHITE_SPACE alone).
ucd_property props = ucd_properties(c, ucd_lookup_category(c));
return (props & (UCD_PROPERTY_WHITE_SPACE | UCD_PROPERTY_NO_BREAK)) == UCD_PROPERTY_WHITE_SPACE;
// Category-switch form: decide from the general category and a few
// hard-coded codepoints, without consulting ucd_properties().
switch (ucd_lookup_category(c))
{
// Zl and Zp are accepted unconditionally.
case UCD_CATEGORY_Zl:
case UCD_CATEGORY_Zp:
return 1;
// Zs is accepted except for the three codepoints listed below.
case UCD_CATEGORY_Zs:
switch (c) // Exclude characters with the <noBreak> DispositionType
{
case 0x00A0: // U+00A0 : NO-BREAK SPACE
case 0x2007: // U+2007 : FIGURE SPACE
case 0x202F: // U+202F : NARROW NO-BREAK SPACE
return 0;
}
return 1;
// Cc is accepted only for the six codepoints listed below.
case UCD_CATEGORY_Cc:
switch (c) // Include control characters marked as White_Space
{
case 0x09: // U+0009 : CHARACTER TABULATION
case 0x0A: // U+000A : LINE FEED
case 0x0B: // U+000B : LINE TABULATION
case 0x0C: // U+000C : FORM FEED
case 0x0D: // U+000D : CARRIAGE RETURN
case 0x85: // U+0085 : NEXT LINE
return 1;
}
// Any other Cc codepoint falls through into default and is rejected.
default:
return 0;
}
}

int ucd_isupper(codepoint_t c)

+ 34
- 36
src/include/ucd/ucd.h View File

@@ -329,24 +329,23 @@ ucd_script ucd_lookup_script(codepoint_t c);
*/
typedef enum ucd_property_
{
/* NOTE(review): the enumerators below are listed twice. This looks like the
 * removed and added sides of a diff hunk shown without +/- markers (the hunk
 * header reports 24 old lines and 23 new lines) -- confirm against the real
 * ucd.h. As written, the duplicate names would not compile.
 *
 * First listing: UCD_PROPERTY_NO_BREAK occupies bit 1 (0x2), so Bidi_Control
 * starts at 0x4 and Noncharacter_Code_Point is 0x20000.
 */
UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space PropList */
UCD_PROPERTY_NO_BREAK = 0x00000002, /**< @brief <noBreak> DispositionType (enabled check) */
UCD_PROPERTY_BIDI_CONTROL = 0x00000004, /**< @brief Bidi_Control PropList */
UCD_PROPERTY_JOIN_CONTROL = 0x00000008, /**< @brief Join_Control PropList */
UCD_PROPERTY_DASH = 0x00000010, /**< @brief Dash PropList */
UCD_PROPERTY_HYPHEN = 0x00000020, /**< @brief Hyphen PropList */
UCD_PROPERTY_QUOTATION_MARK = 0x00000040, /**< @brief Quotation_Mark PropList */
UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000080, /**< @brief Terminal_Punctuation PropList */
UCD_PROPERTY_OTHER_MATH = 0x00000100, /**< @brief Other_Math PropList */
UCD_PROPERTY_HEX_DIGIT = 0x00000200, /**< @brief Hex_Digit PropList */
UCD_PROPERTY_ASCII_HEX_DIGIT = 0x00000400, /**< @brief ASCII_Hex_Digit PropList */
UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
UCD_PROPERTY_OTHER_LOWERCASE = 0x00008000, /**< @brief Other_Lowercase PropList */
UCD_PROPERTY_OTHER_UPPERCASE = 0x00010000, /**< @brief Other_Uppercase PropList */
UCD_PROPERTY_NONCHARACTER_CODE_POINT = 0x00020000, /**< @brief Noncharacter_Code_Point PropList */
/* Second listing: UCD_PROPERTY_NO_BREAK is absent and every flag after
 * White_Space sits one bit lower (Bidi_Control = 0x2 up to
 * Noncharacter_Code_Point = 0x10000).
 */
UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space */
UCD_PROPERTY_BIDI_CONTROL = 0x00000002, /**< @brief Bidi_Control */
UCD_PROPERTY_JOIN_CONTROL = 0x00000004, /**< @brief Join_Control */
UCD_PROPERTY_DASH = 0x00000008, /**< @brief Dash */
UCD_PROPERTY_HYPHEN = 0x00000010, /**< @brief Hyphen */
UCD_PROPERTY_QUOTATION_MARK = 0x00000020, /**< @brief Quotation_Mark */
UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000040, /**< @brief Terminal_Punctuation */
UCD_PROPERTY_OTHER_MATH = 0x00000080, /**< @brief Other_Math */
UCD_PROPERTY_HEX_DIGIT = 0x00000100, /**< @brief Hex_Digit */
UCD_PROPERTY_ASCII_HEX_DIGIT = 0x00000200, /**< @brief ASCII_Hex_Digit */
UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000400, /**< @brief Other_Alphabetic */
UCD_PROPERTY_IDEOGRAPHIC = 0x00000800, /**< @brief Ideographic */
UCD_PROPERTY_DIACRITIC = 0x00001000, /**< @brief Diacritic */
UCD_PROPERTY_EXTENDER = 0x00002000, /**< @brief Extender */
UCD_PROPERTY_OTHER_LOWERCASE = 0x00004000, /**< @brief Other_Lowercase */
UCD_PROPERTY_OTHER_UPPERCASE = 0x00008000, /**< @brief Other_Uppercase */
UCD_PROPERTY_NONCHARACTER_CODE_POINT = 0x00010000, /**< @brief Noncharacter_Code_Point */
} ucd_property;

/** @brief Return the properties of the specified codepoint.
@@ -806,24 +805,23 @@ namespace ucd
*/
enum property
{
/* Each enumerator aliases the UCD_PROPERTY_* constant of the matching name.
 *
 * NOTE(review): the enumerators are listed twice. This looks like the removed
 * and added sides of a diff hunk shown without +/- markers (the hunk header
 * reports 24 old lines and 23 new lines) -- confirm against the real ucd.h.
 *
 * First listing: includes the noBreak alias for UCD_PROPERTY_NO_BREAK.
 */
White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space PropList */
noBreak = UCD_PROPERTY_NO_BREAK, /**< @brief <noBreak> DispositionType (enabled check) */
Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control PropList */
Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control PropList */
Dash = UCD_PROPERTY_DASH, /**< @brief Dash PropList */
Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen PropList */
Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark PropList */
Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation PropList */
Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math PropList */
Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit PropList */
ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit PropList */
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic PropList */
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic PropList */
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic PropList */
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender PropList */
Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase PropList */
Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase PropList */
Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point PropList */
/* Second listing: the same aliases with noBreak left out. */
White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space */
Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control */
Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control */
Dash = UCD_PROPERTY_DASH, /**< @brief Dash */
Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen */
Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark */
Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation */
Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math */
Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit */
ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit */
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic */
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic */
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic */
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender */
Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase */
Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase */
Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point */
};

/** @brief Return the properties of the specified codepoint.

+ 2
- 16
src/proplist.c View File

@@ -1043,8 +1043,7 @@ static int properties_Pd(codepoint_t c)
if (c == 0x1806) return UCD_PROPERTY_DASH | UCD_PROPERTY_HYPHEN;
break;
case 0x2000:
if (c == 0x2010) return UCD_PROPERTY_DASH | UCD_PROPERTY_HYPHEN;
if (c == 0x2011) return UCD_PROPERTY_DASH | UCD_PROPERTY_HYPHEN | UCD_PROPERTY_NO_BREAK;
if (c >= 0x2010 && c <= 0x2011) return UCD_PROPERTY_DASH | UCD_PROPERTY_HYPHEN;
break;
case 0x2E00:
if (c == 0x2E17) return UCD_PROPERTY_DASH | UCD_PROPERTY_HYPHEN;
@@ -1174,7 +1173,6 @@ static int properties_Po(codepoint_t c)
break;
case 0x0F00:
if (c == 0x0F08) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0F0C) return UCD_PROPERTY_NO_BREAK;
if (c >= 0x0F0D && c <= 0x0F12) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1000:
@@ -1507,18 +1505,6 @@ static int properties_So(codepoint_t c)
return 0;
}

/* Property flags for a codepoint that ucd_properties() dispatches here for
 * UCD_CATEGORY_Zs.
 *
 * Every codepoint gets UCD_PROPERTY_WHITE_SPACE; the three no-break spaces
 * listed below additionally get UCD_PROPERTY_NO_BREAK.
 */
static int properties_Zs(codepoint_t c)
{
	const int is_no_break = (c == 0x00A0)  // NO-BREAK SPACE
	                     || (c == 0x2007)  // FIGURE SPACE
	                     || (c == 0x202F); // NARROW NO-BREAK SPACE
	int flags = UCD_PROPERTY_WHITE_SPACE;

	if (is_no_break)
	{
		flags |= UCD_PROPERTY_NO_BREAK;
	}
	return flags;
}

ucd_property ucd_properties(codepoint_t c, ucd_category category)
{
switch (category)
@@ -1546,7 +1532,7 @@ ucd_property ucd_properties(codepoint_t c, ucd_category category)
case UCD_CATEGORY_So: return properties_So(c);
case UCD_CATEGORY_Zl: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zp: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zs: return properties_Zs(c);
case UCD_CATEGORY_Zs: return UCD_PROPERTY_WHITE_SPACE;
default: return 0; // Co Cs Ii Lt Me No Sc
};
}

+ 18
- 19
tools/printdata.py View File

@@ -120,25 +120,24 @@ def decomposition_type(data, dtype):
return None

def properties(data):
# Pack the boolean properties found in `data` into a single integer bitmask
# and return it. Each property is read with data.get(name, 0), so a name that
# is missing from `data` contributes nothing to the result.
#
# NOTE(review): two versions of the accumulation follow. This looks like the
# removed and added sides of a diff hunk shown without +/- markers and with
# indentation stripped -- confirm against the real tools/printdata.py. Read
# literally, the second `props = 0` discards the first sum.
#
# First version: bit weights are decimal literals, and weight 2 is set when
# decomposition_type(data, '<noBreak>') returns something other than None.
props = 0
props += 1 * data.get('White_Space', 0)
props += 2 * (decomposition_type(data, '<noBreak>') != None)
props += 4 * data.get('Bidi_Control', 0)
props += 8 * data.get('Join_Control', 0)
props += 16 * data.get('Dash', 0)
props += 32 * data.get('Hyphen', 0)
props += 64 * data.get('Quotation_Mark', 0)
props += 128 * data.get('Terminal_Punctuation', 0)
props += 256 * data.get('Other_Math', 0)
props += 512 * data.get('Hex_Digit', 0)
props += 1024 * data.get('ASCII_Hex_Digit', 0)
props += 2048 * data.get('Other_Alphabetic', 0)
props += 4096 * data.get('Ideographic', 0)
props += 8192 * data.get('Diacritic', 0)
props += 16384 * data.get('Extender', 0)
props += 32768 * data.get('Other_Lowercase', 0)
props += 65536 * data.get('Other_Uppercase', 0)
props += 131072 * data.get('Noncharacter_Code_Point', 0)
# Second version: weights are written as 2 ** n and the <noBreak> term is
# gone, so every property after White_Space uses the next lower bit.
props = 0
props += (2 ** 0) * data.get('White_Space', 0)
props += (2 ** 1) * data.get('Bidi_Control', 0)
props += (2 ** 2) * data.get('Join_Control', 0)
props += (2 ** 3) * data.get('Dash', 0)
props += (2 ** 4) * data.get('Hyphen', 0)
props += (2 ** 5) * data.get('Quotation_Mark', 0)
props += (2 ** 6) * data.get('Terminal_Punctuation', 0)
props += (2 ** 7) * data.get('Other_Math', 0)
props += (2 ** 8) * data.get('Hex_Digit', 0)
props += (2 ** 9) * data.get('ASCII_Hex_Digit', 0)
props += (2 ** 10) * data.get('Other_Alphabetic', 0)
props += (2 ** 11) * data.get('Ideographic', 0)
props += (2 ** 12) * data.get('Diacritic', 0)
props += (2 ** 13) * data.get('Extender', 0)
props += (2 ** 14) * data.get('Other_Lowercase', 0)
props += (2 ** 15) * data.get('Other_Uppercase', 0)
props += (2 ** 16) * data.get('Noncharacter_Code_Point', 0)
return props

if __name__ == '__main__':

Loading…
Cancel
Save