| @@ -76,14 +76,17 @@ data/ucd/UnicodeData.txt: | |||
| ############################# libucd ########################################## | |||
| src/case.cpp: tools/case.py tools/ucd.py \ | |||
| supplemental/Klingon.txt \ | |||
| data/ucd/UnicodeData.txt | |||
| tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| src/categories.cpp: tools/categories.py tools/ucd.py \ | |||
| supplemental/Klingon.txt \ | |||
| data/ucd/UnicodeData.txt | |||
| tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||
| supplemental/Klingon.txt \ | |||
| data/language-subtag-registry \ | |||
| data/ucd/Scripts.txt | |||
| tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| @@ -108,8 +111,10 @@ tests_printucddata_SOURCES = tests/printucddata.cpp | |||
| tests_printucddata_LDADD = src/libucd.la | |||
| tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | |||
| supplemental/Klingon.txt \ | |||
| data/ucd/UnicodeData.txt \ | |||
| data/ucd/PropList.txt | |||
| data/ucd/PropList.txt \ | |||
| data/ucd/Scripts.txt | |||
| tools/printdata.py ${UCD_ROOTDIR} > $@ | |||
| tests/unicode-data.actual: tests/printucddata | |||
| @@ -1249,6 +1249,26 @@ static const uint8_t categories_00D700[256] = | |||
| /* F0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Cn, Cn, Cn, Cn, | |||
| }; | |||
| static const uint8_t categories_00F800[256] = | |||
| { | |||
| /* 00 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 10 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 20 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 30 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 40 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 50 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 60 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 70 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 80 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* 90 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* A0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* B0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* C0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
| /* D0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | |||
| /* E0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Co, Co, Co, Co, Co, Co, | |||
| /* F0 */ Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Co, Co, Co, Po, Po, So, | |||
| }; | |||
| static const uint8_t categories_00FA00[256] = | |||
| { | |||
| /* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | |||
| @@ -2429,8 +2449,9 @@ static const uint8_t *categories_000000_00D7FF[] = | |||
| categories_00D700, | |||
| }; | |||
| static const uint8_t *categories_00F900_02FAFF[] = | |||
| static const uint8_t *categories_00F800_02FAFF[] = | |||
| { | |||
| categories_00F800, | |||
| categories_Lo, // 00F900 | |||
| categories_00FA00, | |||
| categories_00FB00, | |||
| @@ -2961,10 +2982,10 @@ ucd::category ucd::lookup_category(codepoint_t c) | |||
| return (ucd::category)table[c % 256]; | |||
| } | |||
| if (c <= 0x00DFFF) return Cs; // 00D800..00DFFF : Surrogates | |||
| if (c <= 0x00F8FF) return Co; // 00E000..00F8FF : Private Use Area | |||
| if (c <= 0x02FAFF) // 00F900..02FAFF | |||
| if (c <= 0x00F7FF) return Co; // 00E000..00F7FF : Private Use Area | |||
| if (c <= 0x02FAFF) // 00F800..02FAFF | |||
| { | |||
| const uint8_t *table = categories_00F900_02FAFF[(c - 0x00F900) / 256]; | |||
| const uint8_t *table = categories_00F800_02FAFF[(c - 0x00F800) / 256]; | |||
| return (ucd::category)table[c % 256]; | |||
| } | |||
| if (c <= 0x0DFFFF) return Cn; // 02FB00..0DFFFF : Unassigned | |||
| @@ -202,6 +202,7 @@ namespace ucd | |||
| Phnx, /**< @brief Phoenician Script */ | |||
| Plrd, /**< @brief Miao Script */ | |||
| Prti, /**< @brief Inscriptional Parthian Script */ | |||
| Qaak, /**< @brief Klingon Script */ | |||
| Rjng, /**< @brief Rejang Script */ | |||
| Runr, /**< @brief Runic Script */ | |||
| Samr, /**< @brief Samaritan Script */ | |||
| @@ -1309,6 +1309,26 @@ static const uint8_t scripts_00D700[256] = | |||
| /* F0 */ Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| }; | |||
| static const uint8_t scripts_00F800[256] = | |||
| { | |||
| /* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* D0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, | |||
| /* E0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
| /* F0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zyyy, Zyyy, Zyyy, | |||
| }; | |||
| static const uint8_t scripts_00FA00[256] = | |||
| { | |||
| /* 00 */ Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, | |||
| @@ -2489,8 +2509,9 @@ static const uint8_t *scripts_000000_00D7FF[] = | |||
| scripts_00D700, | |||
| }; | |||
| static const uint8_t *scripts_00F900_02FAFF[] = | |||
| static const uint8_t *scripts_00F800_02FAFF[] = | |||
| { | |||
| scripts_00F800, | |||
| scripts_Hant, // 00F900 | |||
| scripts_00FA00, | |||
| scripts_00FB00, | |||
| @@ -3020,10 +3041,10 @@ ucd::script ucd::lookup_script(codepoint_t c) | |||
| const uint8_t *table = scripts_000000_00D7FF[(c - 0x000000) / 256]; | |||
| return (ucd::script)table[c % 256]; | |||
| } | |||
| if (c <= 0x00F8FF) return Zzzz; // 00D800..00F8FF : Surrogates / Private Use Area | |||
| if (c <= 0x02FAFF) // 00F900..02FAFF | |||
| if (c <= 0x00F7FF) return Zzzz; // 00D800..00F7FF : Surrogates / Private Use Area | |||
| if (c <= 0x02FAFF) // 00F800..02FAFF | |||
| { | |||
| const uint8_t *table = scripts_00F900_02FAFF[(c - 0x00F900) / 256]; | |||
| const uint8_t *table = scripts_00F800_02FAFF[(c - 0x00F800) / 256]; | |||
| return (ucd::script)table[c % 256]; | |||
| } | |||
| if (c <= 0x0DFFFF) return Zzzz; // 02FB00..0DFFFF : Unassigned | |||
| @@ -0,0 +1,41 @@ | |||
| # Code Point ; Script ; General Category ; Name ; Okrand Transliteration | |||
| F8D0;Qaak;Lo;KLINGON LETTER A;a | |||
| F8D1;Qaak;Lo;KLINGON LETTER B;b | |||
| F8D2;Qaak;Lo;KLINGON LETTER CH;ch | |||
| F8D3;Qaak;Lo;KLINGON LETTER D;D | |||
| F8D4;Qaak;Lo;KLINGON LETTER E;e | |||
| F8D5;Qaak;Lo;KLINGON LETTER GH;gh | |||
| F8D6;Qaak;Lo;KLINGON LETTER H;H | |||
| F8D7;Qaak;Lo;KLINGON LETTER I;I | |||
| F8D8;Qaak;Lo;KLINGON LETTER J;j | |||
| F8D9;Qaak;Lo;KLINGON LETTER L;l | |||
| F8DA;Qaak;Lo;KLINGON LETTER M;m | |||
| F8DB;Qaak;Lo;KLINGON LETTER N;n | |||
| F8DC;Qaak;Lo;KLINGON LETTER NG;ng | |||
| F8DD;Qaak;Lo;KLINGON LETTER O;o | |||
| F8DE;Qaak;Lo;KLINGON LETTER P;p | |||
| F8DF;Qaak;Lo;KLINGON LETTER Q;q | |||
| F8E0;Qaak;Lo;KLINGON LETTER QH;Q | |||
| F8E1;Qaak;Lo;KLINGON LETTER R;r | |||
| F8E2;Qaak;Lo;KLINGON LETTER S;S | |||
| F8E3;Qaak;Lo;KLINGON LETTER T;t | |||
| F8E4;Qaak;Lo;KLINGON LETTER TLH;tlh | |||
| F8E5;Qaak;Lo;KLINGON LETTER U;u | |||
| F8E6;Qaak;Lo;KLINGON LETTER V;v | |||
| F8E7;Qaak;Lo;KLINGON LETTER W;w | |||
| F8E8;Qaak;Lo;KLINGON LETTER Y;y | |||
| F8E9;Qaak;Lo;KLINGON LETTER GLOTTAL STOP;' | |||
| F8F0;Qaak;Nd;KLINGON DIGIT ZERO;0 | |||
| F8F1;Qaak;Nd;KLINGON DIGIT ONE;1 | |||
| F8F2;Qaak;Nd;KLINGON DIGIT TWO;2 | |||
| F8F3;Qaak;Nd;KLINGON DIGIT THREE;3 | |||
| F8F4;Qaak;Nd;KLINGON DIGIT FOUR;4 | |||
| F8F5;Qaak;Nd;KLINGON DIGIT FIVE;5 | |||
| F8F6;Qaak;Nd;KLINGON DIGIT SIX;6 | |||
| F8F7;Qaak;Nd;KLINGON DIGIT SEVEN;7 | |||
| F8F8;Qaak;Nd;KLINGON DIGIT EIGHT;8 | |||
| F8F9;Qaak;Nd;KLINGON DIGIT NINE;9 | |||
| F8FD;Zyyy;Po;KLINGON COMMA;, | |||
| F8FE;Zyyy;Po;KLINGON FULL STOP;. | |||
| F8FF;Zyyy;So;KLINGON MUMMIFICATION GLYPH; | |||
| @@ -154,6 +154,7 @@ const char *get_script_string(ucd::script s) | |||
| case Phnx: return "Phnx"; | |||
| case Plrd: return "Plrd"; | |||
| case Prti: return "Prti"; | |||
| case Qaak: return "Qaak"; | |||
| case Rjng: return "Rjng"; | |||
| case Runr: return "Runr"; | |||
| case Samr: return "Samr"; | |||
| @@ -28,6 +28,9 @@ unicode_chars = {} | |||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | |||
| for codepoint in data['CodePoint']: | |||
| unicode_chars[codepoint] = data['GeneralCategory'] | |||
| for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
| for codepoint in data['CodePoint']: | |||
| unicode_chars[codepoint] = data['GeneralCategory'] | |||
| # This map is a combination of the information in the UnicodeData and Blocks | |||
| # data files. It is intended to reduce the number of character tables that | |||
| @@ -35,8 +38,8 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | |||
| category_sets = [ | |||
| (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'), | |||
| (ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'), | |||
| (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'), | |||
| (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'), | |||
| (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'), | |||
| @@ -35,6 +35,13 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'): | |||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
| for codepoint in data['Range']: | |||
| unicode_chars[codepoint]['Script'] = data['Script'] | |||
| for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
| for codepoint in data['CodePoint']: | |||
| unicode_chars[codepoint] = data | |||
| unicode_chars[codepoint]['Properties'] = [] | |||
| unicode_chars[codepoint]['UpperCase'] = ucd.CodePoint('0000') | |||
| unicode_chars[codepoint]['LowerCase'] = ucd.CodePoint('0000') | |||
| unicode_chars[codepoint]['TitleCase'] = ucd.CodePoint('0000') | |||
| null = ucd.CodePoint('0000') | |||
| if __name__ == '__main__': | |||
| @@ -28,14 +28,17 @@ unicode_chars = {} | |||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
| for codepoint in data['Range']: | |||
| unicode_chars[codepoint] = data['Script'] | |||
| for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
| for codepoint in data['CodePoint']: | |||
| unicode_chars[codepoint] = data['Script'] | |||
| # This map is a combination of the information in the UnicodeData and Blocks | |||
| # data files. It is intended to reduce the number of character tables that | |||
| # need to be generated. | |||
| script_sets = [ | |||
| (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('00D800..00F8FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||
| (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||
| (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'), | |||
| (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | |||
| (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'), | |||
| @@ -147,6 +147,14 @@ data_items = { | |||
| ('LowerCase', codepoint), | |||
| ('TitleCase', codepoint), | |||
| ], | |||
| # Supplemental Data: | |||
| 'Klingon': [ | |||
| ('CodePoint', codepoint), | |||
| ('Script', str), | |||
| ('GeneralCategory', string), | |||
| ('Name', string), | |||
| ('Transliteration', string), | |||
| ], | |||
| } | |||
| def parse_ucd_data(ucd_rootdir, dataset): | |||