Browse Source

Add Extender support from PropList.txt.

master
Reece H. Dunn 8 years ago
parent
commit
735a661232
3 changed files with 67 additions and 23 deletions
  1. 2
    0
      src/include/ucd/ucd.h
  2. 49
    8
      src/proplist.c
  3. 16
    15
      tools/printdata.py

+ 2
- 0
src/include/ucd/ucd.h View File

@@ -343,6 +343,7 @@ typedef enum ucd_property_
UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
} ucd_property;

/** @brief Return the properties of the specified codepoint.
@@ -816,6 +817,7 @@ namespace ucd
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic PropList */
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic PropList */
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic PropList */
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender PropList */
};

/** @brief Return the properties of the specified codepoint.

+ 49
- 8
src/proplist.c View File

@@ -110,7 +110,8 @@ static int properties_Lm(codepoint_t c)
{
case 0x0200:
if (c >= 0x02B0 && c <= 0x02C1) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02C6 && c <= 0x02D1) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02C6 && c <= 0x02CF) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02D0 && c <= 0x02D1) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0x02E0 && c <= 0x02E4) return UCD_PROPERTY_DIACRITIC;
if (c == 0x02EC) return UCD_PROPERTY_DIACRITIC;
if (c == 0x02EE) return UCD_PROPERTY_DIACRITIC;
@@ -123,15 +124,28 @@ static int properties_Lm(codepoint_t c)
if (c == 0x0559) return UCD_PROPERTY_DIACRITIC;
break;
case 0x0600:
if (c == 0x0640) return UCD_PROPERTY_EXTENDER;
if (c >= 0x06E5 && c <= 0x06E6) return UCD_PROPERTY_DIACRITIC;
break;
case 0x0700:
if (c >= 0x07F4 && c <= 0x07F5) return UCD_PROPERTY_DIACRITIC;
if (c == 0x07FA) return UCD_PROPERTY_EXTENDER;
break;
case 0x0900:
if (c == 0x0971) return UCD_PROPERTY_DIACRITIC;
break;
case 0x0E00:
if (c == 0x0E46) return UCD_PROPERTY_EXTENDER;
if (c == 0x0EC6) return UCD_PROPERTY_EXTENDER;
break;
case 0x1800:
if (c == 0x1843) return UCD_PROPERTY_EXTENDER;
break;
case 0x1A00:
if (c == 0x1AA7) return UCD_PROPERTY_EXTENDER;
break;
case 0x1C00:
if (c == 0x1C7B) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0x1C78 && c <= 0x1C7D) return UCD_PROPERTY_DIACRITIC;
break;
case 0x1D00:
@@ -141,9 +155,17 @@ static int properties_Lm(codepoint_t c)
if (c == 0x2E2F) return UCD_PROPERTY_DIACRITIC;
break;
case 0x3000:
if (c == 0x30FC) return UCD_PROPERTY_DIACRITIC;
if (c == 0x3005) return UCD_PROPERTY_EXTENDER;
if (c >= 0x3031 && c <= 0x3035) return UCD_PROPERTY_EXTENDER;
if (c >= 0x309D && c <= 0x309E) return UCD_PROPERTY_EXTENDER;
if (c == 0x30FC) return UCD_PROPERTY_EXTENDER | UCD_PROPERTY_DIACRITIC;
if (c >= 0x30FD && c <= 0x30FE) return UCD_PROPERTY_EXTENDER;
break;
case 0xA000:
if (c == 0xA015) return UCD_PROPERTY_EXTENDER;
break;
case 0xA600:
if (c == 0xA60C) return UCD_PROPERTY_EXTENDER;
if (c == 0xA67F) return UCD_PROPERTY_DIACRITIC;
if (c >= 0xA69C && c <= 0xA69D) return UCD_PROPERTY_DIACRITIC;
break;
@@ -152,15 +174,28 @@ static int properties_Lm(codepoint_t c)
if (c == 0xA788) return UCD_PROPERTY_DIACRITIC;
if (c >= 0xA7F8 && c <= 0xA7F9) return UCD_PROPERTY_DIACRITIC;
break;
case 0xA900:
if (c == 0xA9CF) return UCD_PROPERTY_EXTENDER;
if (c == 0xA9E6) return UCD_PROPERTY_EXTENDER;
break;
case 0xAA00:
if (c == 0xAA70) return UCD_PROPERTY_EXTENDER;
if (c == 0xAADD) return UCD_PROPERTY_EXTENDER;
if (c >= 0xAAF3 && c <= 0xAAF4) return UCD_PROPERTY_EXTENDER;
break;
case 0xAB00:
if (c >= 0xAB5C && c <= 0xAB5F) return UCD_PROPERTY_DIACRITIC;
break;
case 0xFF00:
if (c == 0xFF70) return UCD_PROPERTY_DIACRITIC;
if (c == 0xFF70) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0xFF9E && c <= 0xFF9F) return UCD_PROPERTY_DIACRITIC;
break;
case 0x016B00:
if (c >= 0x016B42 && c <= 0x016B43) return UCD_PROPERTY_EXTENDER;
break;
case 0x016F00:
if (c >= 0x016F93 && c <= 0x016F9F) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x016F93 && c <= 0x016F9F) return UCD_PROPERTY_DIACRITIC;
if (c == 0x016FE0) return UCD_PROPERTY_EXTENDER;
break;
}
return 0;
@@ -180,7 +215,10 @@ static int properties_Lo(codepoint_t c)
if (c == 0xAAC0) return UCD_PROPERTY_DIACRITIC;
if (c == 0xAAC2) return UCD_PROPERTY_DIACRITIC;
break;
case 0x01EE00:
case 0x11300:
if (c == 0x01135D) return UCD_PROPERTY_EXTENDER;
break;
case 0x1EE00:
return UCD_PROPERTY_OTHER_MATH;
}
return 0;
@@ -718,7 +756,8 @@ static int properties_Mn(codepoint_t c)
break;
case 0x1C00:
if (c >= 0x1C2C && c <= 0x1C33) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x1C36 && c <= 0x1C37) return UCD_PROPERTY_DIACRITIC;
if (c == 0x1C36) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c == 0x1C37) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CD0 && c <= 0x1CD2) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CD4 && c <= 0x1CE0) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CE2 && c <= 0x1CE8) return UCD_PROPERTY_DIACRITIC;
@@ -913,7 +952,7 @@ static int properties_Mn(codepoint_t c)
if (c >= 0x01E8D0 && c <= 0x01E8D6) return UCD_PROPERTY_DIACRITIC;
break;
case 0x01E900:
if (c >= 0x01E944 && c <= 0x01E946) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x01E944 && c <= 0x01E946) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c == 0x01E947) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x01E948 && c <= 0x01E94A) return UCD_PROPERTY_DIACRITIC;
break;
@@ -1074,7 +1113,7 @@ static int properties_Po(codepoint_t c)
if (c == 0x002E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x003A && c <= 0x003B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x003F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x00B7) return UCD_PROPERTY_DIACRITIC;
if (c == 0x00B7) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
break;
case 0x0300:
if (c == 0x037E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
@@ -1128,6 +1167,7 @@ static int properties_Po(codepoint_t c)
case 0x1800:
if (c >= 0x1802 && c <= 0x1805) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x1808 && c <= 0x1809) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x180A) return UCD_PROPERTY_EXTENDER;
break;
case 0x1900:
if (c >= 0x1944 && c <= 0x1945) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
@@ -1241,6 +1281,7 @@ static int properties_Po(codepoint_t c)
break;
case 0x11500:
if (c >= 0x0115C2 && c <= 0x0115C5) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0115C6 && c <= 0x0115C8) return UCD_PROPERTY_EXTENDER;
if (c >= 0x0115C9 && c <= 0x0115D7) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x11600:

+ 16
- 15
tools/printdata.py View File

@@ -120,21 +120,22 @@ def decomposition_type(data, dtype):
return None

def properties(data):
    """Return the combined PropList bit mask for one codepoint record.

    Each supported property contributes ``bit * data.get(name, 0)`` to the
    result, so a property missing from ``data`` contributes nothing.  The
    ``<noBreak>`` bit is derived from ``decomposition_type`` rather than from
    a key in ``data``.

    ``data`` is expected to behave like a dict keyed by property name.
    NOTE(review): the values are presumably 0/1 flags -- confirm against the
    code that builds ``data``; any other value would be scaled by the bit
    weight rather than OR-ed in.
    """
    # (bit value, property key) pairs.  The bit values presumably mirror the
    # UCD_PROPERTY_* constants in src/include/ucd/ucd.h (Extender is 0x4000
    # there) -- keep the two in sync when adding a property.
    flag_bits = (
        (0x0004, 'Bidi_Control'),
        (0x0008, 'Join_Control'),
        (0x0010, 'Dash'),
        (0x0020, 'Hyphen'),
        (0x0040, 'Quotation_Mark'),
        (0x0080, 'Terminal_Punctuation'),
        (0x0100, 'Other_Math'),
        (0x0200, 'Hex_Digit'),
        (0x0400, 'ASCII_Hex_Digit'),
        (0x0800, 'Other_Alphabetic'),
        (0x1000, 'Ideographic'),
        (0x2000, 'Diacritic'),
        (0x4000, 'Extender'),
    )

    props = 0x0001 * data.get('White_Space', 0)
    # Bit 0x0002 is set when the record has a <noBreak> decomposition type.
    props += 0x0002 * (decomposition_type(data, '<noBreak>') is not None)
    for bit, name in flag_bits:
        props += bit * data.get(name, 0)
    return props

if __name__ == '__main__':

Loading…
Cancel
Save