Browse Source

Add Terminal_Punctuation support from PropList.txt.

master
Reece H. Dunn 8 years ago
parent
commit
4226457fb5
3 changed files with 204 additions and 23 deletions
  1. 16
    14
      src/include/ucd/ucd.h
  2. 179
    1
      src/proplist.c
  3. 9
    8
      tools/printdata.py

+ 16
- 14
src/include/ucd/ucd.h View File

@@ -329,13 +329,14 @@ ucd_script ucd_lookup_script(codepoint_t c);
*/
typedef enum ucd_property_
{
	/* Every enumerator is a distinct single bit, and each one is defined
	 * exactly once. */
	UCD_PROPERTY_WHITE_SPACE = 0x00000001, /**< @brief White_Space PropList */
	UCD_PROPERTY_NO_BREAK = 0x00000002, /**< @brief <noBreak> Decomposition_Type (enabled check only) */
	UCD_PROPERTY_BIDI_CONTROL = 0x00000004, /**< @brief Bidi_Control PropList */
	UCD_PROPERTY_JOIN_CONTROL = 0x00000008, /**< @brief Join_Control PropList */
	UCD_PROPERTY_DASH = 0x00000010, /**< @brief Dash PropList */
	UCD_PROPERTY_HYPHEN = 0x00000020, /**< @brief Hyphen PropList */
	UCD_PROPERTY_QUOTATION_MARK = 0x00000040, /**< @brief Quotation_Mark PropList */
	UCD_PROPERTY_TERMINAL_PUNCTUATION = 0x00000080, /**< @brief Terminal_Punctuation PropList */
} ucd_property;

/** @brief Return the properties of the specified codepoint.
@@ -795,13 +796,14 @@ namespace ucd
*/
enum property
{
	// Each enumerator takes its value from the UCD_PROPERTY_* constant of
	// the same property, and each one is defined exactly once.
	White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space PropList */
	noBreak = UCD_PROPERTY_NO_BREAK, /**< @brief <noBreak> Decomposition_Type (enabled check only) */
	Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control PropList */
	Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control PropList */
	Dash = UCD_PROPERTY_DASH, /**< @brief Dash PropList */
	Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen PropList */
	Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark PropList */
	Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation PropList */
};

/** @brief Return the properties of the specified codepoint.

+ 179
- 1
src/proplist.c View File

@@ -135,22 +135,200 @@ static int properties_Po(codepoint_t c)
switch (c & 0xFFFFFF00)
{
case 0x0000:
if (c == 0x0021) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0022) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0x0027) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0x002C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x002E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x003A && c <= 0x003B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x003F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0300:
if (c == 0x037E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0387) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0500:
if (c == 0x0589) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x05C3) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0600:
if (c == 0x060C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x061B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x061F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x06D4) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0700:
if (c >= 0x0700 && c <= 0x070A) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x070C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x07F8 && c <= 0x07F9) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0800:
if (c >= 0x0830 && c <= 0x083E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x085E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0900:
if (c >= 0x0964 && c <= 0x0965) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0E00:
if (c >= 0x0E5A && c <= 0x0E5B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0F00:
if (c == 0x0F08) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0F0C) return UCD_PROPERTY_NO_BREAK;
if (c >= 0x0F0D && c <= 0x0F12) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1000:
if (c >= 0x104A && c <= 0x104B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1300:
if (c >= 0x1361 && c <= 0x1368) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1600:
if (c >= 0x166D && c <= 0x166E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x16EB && c <= 0x16ED) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1700:
if (c >= 0x1735 && c <= 0x1736) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x17D4 && c <= 0x17D6) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x17DA) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1800:
if (c >= 0x1802 && c <= 0x1805) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x1808 && c <= 0x1809) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1900:
if (c >= 0x1944 && c <= 0x1945) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1A00:
if (c >= 0x1AA8 && c <= 0x1AAB) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1B00:
if (c >= 0x1B5A && c <= 0x1B5B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x1B5D && c <= 0x1B5F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1C00:
if (c >= 0x1C3B && c <= 0x1C3F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x1C7E && c <= 0x1C7F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x2000:
if (c >= 0x203C && c <= 0x203D) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x2047 && c <= 0x2049) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x2053) return UCD_PROPERTY_DASH;
break;
case 0x2E00:
if (c == 0x2E2E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x2E3C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x2E41) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x3000:
if (c >= 0x3001 && c <= 0x3002) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x30FB) return UCD_PROPERTY_HYPHEN;
break;
case 0xA400:
if (c >= 0xA4FE && c <= 0xA4FF) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xA600:
if (c >= 0xA60D && c <= 0xA60F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0xA6F3 && c <= 0xA6F7) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xA800:
if (c >= 0xA876 && c <= 0xA877) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0xA8CE && c <= 0xA8CF) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xA900:
if (c == 0xA92F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0xA9C7 && c <= 0xA9C9) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xAA00:
if (c >= 0xAA5D && c <= 0xAA5F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xAADF) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0xAAF0 && c <= 0xAAF1) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xAB00:
if (c == 0xABEB) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xFE00:
if (c >= 0xFE50 && c <= 0xFE52) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0xFE54 && c <= 0xFE57) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0xFF00:
if (c == 0xFF01) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF02) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0xFF07) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0xFF0C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF0E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF65) return UCD_PROPERTY_HYPHEN;
if (c >= 0xFF1A && c <= 0xFF1B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF1F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF61) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0xFF64) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x10300:
if (c == 0x01039F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0103D0) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x10800:
if (c == 0x010857) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x10900:
if (c == 0x01091F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x10A00:
if (c >= 0x010A56 && c <= 0x010A57) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x010AF0 && c <= 0x010AF5) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x10B00:
if (c >= 0x010B3A && c <= 0x010B3F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x010B99 && c <= 0x010B9C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11000:
if (c >= 0x011047 && c <= 0x01104D) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0110BE && c <= 0x0110C1) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11100:
if (c >= 0x011141 && c <= 0x011143) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0111C5 && c <= 0x0111C6) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0111CD) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0111DE && c <= 0x0111DF) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11200:
if (c >= 0x011238 && c <= 0x01123C) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0112A9) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11400:
if (c >= 0x01144B && c <= 0x01144D) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x01145B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11500:
if (c >= 0x0115C2 && c <= 0x0115C5) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0115C9 && c <= 0x0115D7) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11600:
if (c >= 0x011641 && c <= 0x011642) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11700:
if (c >= 0x01173C && c <= 0x01173E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11C00:
if (c >= 0x011C41 && c <= 0x011C43) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x011C71) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x12400:
if (c >= 0x012470 && c <= 0x012474) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x16A00:
if (c >= 0x016A6E && c <= 0x016A6F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x016AF5) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x16B00:
if (c >= 0x016B37 && c <= 0x016B39) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x016B44) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1BC00:
if (c == 0x01BC9F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x1DA00:
if (c >= 0x01DA87 && c <= 0x01DA8A) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
}
return 0;
@@ -226,6 +404,6 @@ ucd_property ucd_properties(codepoint_t c, ucd_category category)
case UCD_CATEGORY_Zl: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zp: return UCD_PROPERTY_WHITE_SPACE;
case UCD_CATEGORY_Zs: return properties_Zs(c);
default: return 0;
default: return 0; // Cn Co Cs Ii L& M& N& Pc Sc Sk So
};
}

+ 9
- 8
tools/printdata.py View File

@@ -120,14 +120,15 @@ def decomposition_type(data, dtype):
return None

def properties(data):
    """Pack the binary properties of one codepoint record into an integer.

    ``data`` is read with ``data.get(name, 0)``, so a property missing from
    the record contributes nothing. The <noBreak> bit is set when
    ``decomposition_type(data, '<noBreak>')`` returns something other than
    None. The weights (1, 2, 4, ... 128) are the same values as the
    UCD_PROPERTY_* flags.

    NOTE(review): assumes each property value in ``data`` is 0 or 1 so the
    sum behaves as a bit mask -- confirm against the data loader.
    """
    props = 0
    props += 1 * data.get('White_Space', 0)
    # A bool multiplies as 0 or 1, so this adds 2 only when a <noBreak>
    # decomposition is present.
    props += 2 * (decomposition_type(data, '<noBreak>') is not None)
    props += 4 * data.get('Bidi_Control', 0)
    props += 8 * data.get('Join_Control', 0)
    props += 16 * data.get('Dash', 0)
    props += 32 * data.get('Hyphen', 0)
    props += 64 * data.get('Quotation_Mark', 0)
    props += 128 * data.get('Terminal_Punctuation', 0)
    return props

if __name__ == '__main__':

Loading…
Cancel
Save