Browse Source

Add Extender support from PropList.txt.

master
Reece H. Dunn 8 years ago
parent
commit
735a661232
3 changed files with 67 additions and 23 deletions
  1. src/include/ucd/ucd.h (+2, -0)
  2. src/proplist.c (+49, -8)
  3. tools/printdata.py (+16, -15)

+ 2
- 0
src/include/ucd/ucd.h View File

UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */ UCD_PROPERTY_OTHER_ALPHABETIC = 0x00000800, /**< @brief Other_Alphabetic PropList */
UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */ UCD_PROPERTY_IDEOGRAPHIC = 0x00001000, /**< @brief Ideographic PropList */
UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */ UCD_PROPERTY_DIACRITIC = 0x00002000, /**< @brief Diacritic PropList */
UCD_PROPERTY_EXTENDER = 0x00004000, /**< @brief Extender PropList */
} ucd_property; } ucd_property;


/** @brief Return the properties of the specified codepoint. /** @brief Return the properties of the specified codepoint.
Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic PropList */ Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic PropList */
Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic PropList */ Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic PropList */
Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic PropList */ Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic PropList */
Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender PropList */
}; };


/** @brief Return the properties of the specified codepoint. /** @brief Return the properties of the specified codepoint.

+ 49
- 8
src/proplist.c View File

{ {
case 0x0200: case 0x0200:
if (c >= 0x02B0 && c <= 0x02C1) return UCD_PROPERTY_DIACRITIC; if (c >= 0x02B0 && c <= 0x02C1) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02C6 && c <= 0x02D1) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02C6 && c <= 0x02CF) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x02D0 && c <= 0x02D1) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0x02E0 && c <= 0x02E4) return UCD_PROPERTY_DIACRITIC; if (c >= 0x02E0 && c <= 0x02E4) return UCD_PROPERTY_DIACRITIC;
if (c == 0x02EC) return UCD_PROPERTY_DIACRITIC; if (c == 0x02EC) return UCD_PROPERTY_DIACRITIC;
if (c == 0x02EE) return UCD_PROPERTY_DIACRITIC; if (c == 0x02EE) return UCD_PROPERTY_DIACRITIC;
if (c == 0x0559) return UCD_PROPERTY_DIACRITIC; if (c == 0x0559) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x0600: case 0x0600:
if (c == 0x0640) return UCD_PROPERTY_EXTENDER;
if (c >= 0x06E5 && c <= 0x06E6) return UCD_PROPERTY_DIACRITIC; if (c >= 0x06E5 && c <= 0x06E6) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x0700: case 0x0700:
if (c >= 0x07F4 && c <= 0x07F5) return UCD_PROPERTY_DIACRITIC; if (c >= 0x07F4 && c <= 0x07F5) return UCD_PROPERTY_DIACRITIC;
if (c == 0x07FA) return UCD_PROPERTY_EXTENDER;
break; break;
case 0x0900: case 0x0900:
if (c == 0x0971) return UCD_PROPERTY_DIACRITIC; if (c == 0x0971) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x0E00:
if (c == 0x0E46) return UCD_PROPERTY_EXTENDER;
if (c == 0x0EC6) return UCD_PROPERTY_EXTENDER;
break;
case 0x1800:
if (c == 0x1843) return UCD_PROPERTY_EXTENDER;
break;
case 0x1A00:
if (c == 0x1AA7) return UCD_PROPERTY_EXTENDER;
break;
case 0x1C00: case 0x1C00:
if (c == 0x1C7B) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0x1C78 && c <= 0x1C7D) return UCD_PROPERTY_DIACRITIC; if (c >= 0x1C78 && c <= 0x1C7D) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x1D00: case 0x1D00:
if (c == 0x2E2F) return UCD_PROPERTY_DIACRITIC; if (c == 0x2E2F) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x3000: case 0x3000:
if (c == 0x30FC) return UCD_PROPERTY_DIACRITIC;
if (c == 0x3005) return UCD_PROPERTY_EXTENDER;
if (c >= 0x3031 && c <= 0x3035) return UCD_PROPERTY_EXTENDER;
if (c >= 0x309D && c <= 0x309E) return UCD_PROPERTY_EXTENDER;
if (c == 0x30FC) return UCD_PROPERTY_EXTENDER | UCD_PROPERTY_DIACRITIC;
if (c >= 0x30FD && c <= 0x30FE) return UCD_PROPERTY_EXTENDER;
break;
case 0xA000:
if (c == 0xA015) return UCD_PROPERTY_EXTENDER;
break; break;
case 0xA600: case 0xA600:
if (c == 0xA60C) return UCD_PROPERTY_EXTENDER;
if (c == 0xA67F) return UCD_PROPERTY_DIACRITIC; if (c == 0xA67F) return UCD_PROPERTY_DIACRITIC;
if (c >= 0xA69C && c <= 0xA69D) return UCD_PROPERTY_DIACRITIC; if (c >= 0xA69C && c <= 0xA69D) return UCD_PROPERTY_DIACRITIC;
break; break;
if (c == 0xA788) return UCD_PROPERTY_DIACRITIC; if (c == 0xA788) return UCD_PROPERTY_DIACRITIC;
if (c >= 0xA7F8 && c <= 0xA7F9) return UCD_PROPERTY_DIACRITIC; if (c >= 0xA7F8 && c <= 0xA7F9) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0xA900:
if (c == 0xA9CF) return UCD_PROPERTY_EXTENDER;
if (c == 0xA9E6) return UCD_PROPERTY_EXTENDER;
break;
case 0xAA00:
if (c == 0xAA70) return UCD_PROPERTY_EXTENDER;
if (c == 0xAADD) return UCD_PROPERTY_EXTENDER;
if (c >= 0xAAF3 && c <= 0xAAF4) return UCD_PROPERTY_EXTENDER;
break;
case 0xAB00: case 0xAB00:
if (c >= 0xAB5C && c <= 0xAB5F) return UCD_PROPERTY_DIACRITIC; if (c >= 0xAB5C && c <= 0xAB5F) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0xFF00: case 0xFF00:
if (c == 0xFF70) return UCD_PROPERTY_DIACRITIC;
if (c == 0xFF70) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c >= 0xFF9E && c <= 0xFF9F) return UCD_PROPERTY_DIACRITIC; if (c >= 0xFF9E && c <= 0xFF9F) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x016B00:
if (c >= 0x016B42 && c <= 0x016B43) return UCD_PROPERTY_EXTENDER;
break;
case 0x016F00: case 0x016F00:
if (c >= 0x016F93 && c <= 0x016F9F) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x016F93 && c <= 0x016F9F) return UCD_PROPERTY_DIACRITIC;
if (c == 0x016FE0) return UCD_PROPERTY_EXTENDER;
break; break;
} }
return 0; return 0;
if (c == 0xAAC0) return UCD_PROPERTY_DIACRITIC; if (c == 0xAAC0) return UCD_PROPERTY_DIACRITIC;
if (c == 0xAAC2) return UCD_PROPERTY_DIACRITIC; if (c == 0xAAC2) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x01EE00:
case 0x11300:
if (c == 0x01135D) return UCD_PROPERTY_EXTENDER;
break;
case 0x1EE00:
return UCD_PROPERTY_OTHER_MATH; return UCD_PROPERTY_OTHER_MATH;
} }
return 0; return 0;
break; break;
case 0x1C00: case 0x1C00:
if (c >= 0x1C2C && c <= 0x1C33) return UCD_PROPERTY_OTHER_ALPHABETIC; if (c >= 0x1C2C && c <= 0x1C33) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x1C36 && c <= 0x1C37) return UCD_PROPERTY_DIACRITIC;
if (c == 0x1C36) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c == 0x1C37) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CD0 && c <= 0x1CD2) return UCD_PROPERTY_DIACRITIC; if (c >= 0x1CD0 && c <= 0x1CD2) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CD4 && c <= 0x1CE0) return UCD_PROPERTY_DIACRITIC; if (c >= 0x1CD4 && c <= 0x1CE0) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x1CE2 && c <= 0x1CE8) return UCD_PROPERTY_DIACRITIC; if (c >= 0x1CE2 && c <= 0x1CE8) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x01E8D0 && c <= 0x01E8D6) return UCD_PROPERTY_DIACRITIC; if (c >= 0x01E8D0 && c <= 0x01E8D6) return UCD_PROPERTY_DIACRITIC;
break; break;
case 0x01E900: case 0x01E900:
if (c >= 0x01E944 && c <= 0x01E946) return UCD_PROPERTY_DIACRITIC;
if (c >= 0x01E944 && c <= 0x01E946) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
if (c == 0x01E947) return UCD_PROPERTY_OTHER_ALPHABETIC; if (c == 0x01E947) return UCD_PROPERTY_OTHER_ALPHABETIC;
if (c >= 0x01E948 && c <= 0x01E94A) return UCD_PROPERTY_DIACRITIC; if (c >= 0x01E948 && c <= 0x01E94A) return UCD_PROPERTY_DIACRITIC;
break; break;
if (c == 0x002E) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c == 0x002E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x003A && c <= 0x003B) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x003A && c <= 0x003B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x003F) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c == 0x003F) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x00B7) return UCD_PROPERTY_DIACRITIC;
if (c == 0x00B7) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER;
break; break;
case 0x0300: case 0x0300:
if (c == 0x037E) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c == 0x037E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
case 0x1800: case 0x1800:
if (c >= 0x1802 && c <= 0x1805) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x1802 && c <= 0x1805) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x1808 && c <= 0x1809) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x1808 && c <= 0x1809) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x180A) return UCD_PROPERTY_EXTENDER;
break; break;
case 0x1900: case 0x1900:
if (c >= 0x1944 && c <= 0x1945) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x1944 && c <= 0x1945) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break; break;
case 0x11500: case 0x11500:
if (c >= 0x0115C2 && c <= 0x0115C5) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x0115C2 && c <= 0x0115C5) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c >= 0x0115C6 && c <= 0x0115C8) return UCD_PROPERTY_EXTENDER;
if (c >= 0x0115C9 && c <= 0x0115D7) return UCD_PROPERTY_TERMINAL_PUNCTUATION; if (c >= 0x0115C9 && c <= 0x0115D7) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break; break;
case 0x11600: case 0x11600:

+ 16
- 15
tools/printdata.py View File

return None return None


def properties(data): def properties(data):
props = 0
props += 1 * data.get('White_Space', 0)
props += 2 * (decomposition_type(data, '<noBreak>') != None)
props += 4 * data.get('Bidi_Control', 0)
props += 8 * data.get('Join_Control', 0)
props += 16 * data.get('Dash', 0)
props += 32 * data.get('Hyphen', 0)
props += 64 * data.get('Quotation_Mark', 0)
props += 128 * data.get('Terminal_Punctuation', 0)
props += 256 * data.get('Other_Math', 0)
props += 512 * data.get('Hex_Digit', 0)
props += 1024 * data.get('ASCII_Hex_Digit', 0)
props += 2048 * data.get('Other_Alphabetic', 0)
props += 4096 * data.get('Ideographic', 0)
props += 8192 * data.get('Diacritic', 0)
props = 0
props += 1 * data.get('White_Space', 0)
props += 2 * (decomposition_type(data, '<noBreak>') != None)
props += 4 * data.get('Bidi_Control', 0)
props += 8 * data.get('Join_Control', 0)
props += 16 * data.get('Dash', 0)
props += 32 * data.get('Hyphen', 0)
props += 64 * data.get('Quotation_Mark', 0)
props += 128 * data.get('Terminal_Punctuation', 0)
props += 256 * data.get('Other_Math', 0)
props += 512 * data.get('Hex_Digit', 0)
props += 1024 * data.get('ASCII_Hex_Digit', 0)
props += 2048 * data.get('Other_Alphabetic', 0)
props += 4096 * data.get('Ideographic', 0)
props += 8192 * data.get('Diacritic', 0)
props += 16384 * data.get('Extender', 0)
return props return props


if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save