@@ -76,14 +76,17 @@ data/ucd/UnicodeData.txt: | |||
############################# libucd ########################################## | |||
# Generated libucd sources. Each file is rebuilt by running its generator
# script with ${UCD_ROOTDIR} and ${UCD_VERSION} and redirecting stdout to the
# target. The prerequisites are the generator itself, the shared tools/ucd.py
# module, the supplemental Klingon data, and the data files listed per target.
src/case.cpp: tools/case.py tools/ucd.py \ | |||
supplemental/Klingon.txt \ | |||
data/ucd/UnicodeData.txt | |||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
src/categories.cpp: tools/categories.py tools/ucd.py \ | |||
supplemental/Klingon.txt \ | |||
data/ucd/UnicodeData.txt | |||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||
supplemental/Klingon.txt \ | |||
data/language-subtag-registry \ | |||
data/ucd/Scripts.txt | |||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
@@ -108,8 +111,10 @@ tests_printucddata_SOURCES = tests/printucddata.cpp | |||
tests_printucddata_LDADD = src/libucd.la | |||
tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | |||
supplemental/Klingon.txt \ | |||
data/ucd/UnicodeData.txt \ | |||
data/ucd/PropList.txt | |||
data/ucd/PropList.txt \ | |||
data/ucd/Scripts.txt | |||
tools/printdata.py ${UCD_ROOTDIR} > $@ | |||
tests/unicode-data.actual: tests/printucddata |
@@ -1249,6 +1249,26 @@ static const uint8_t categories_00D700[256] = | |||
/* F0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Cn, Cn, Cn, Cn, | |||
}; | |||
// General category of each code point in 00F800..00F8FF; the row labels give
// the low byte of the code point. Entries are Co (Private Use Area) except
// for the supplemental Klingon assignments: D0..E9 are the Klingon letters
// (Lo), F0..F9 the Klingon digits (Nd), FD..FE the Klingon comma and full
// stop (Po), and FF the Klingon mummification glyph (So).
static const uint8_t categories_00F800[256] = | |||
{ | |||
/* 00 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 10 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 20 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 30 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 40 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 50 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 60 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 70 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 80 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* 90 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* A0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* B0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* C0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||
/* D0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | |||
/* E0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Co, Co, Co, Co, Co, Co, | |||
/* F0 */ Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Co, Co, Co, Po, Po, So, | |||
}; | |||
static const uint8_t categories_00FA00[256] = | |||
{ | |||
/* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | |||
@@ -2429,8 +2449,9 @@ static const uint8_t *categories_000000_00D7FF[] = | |||
categories_00D700, | |||
}; | |||
static const uint8_t *categories_00F900_02FAFF[] = | |||
static const uint8_t *categories_00F800_02FAFF[] = | |||
{ | |||
categories_00F800, | |||
categories_Lo, // 00F900 | |||
categories_00FA00, | |||
categories_00FB00, | |||
@@ -2961,10 +2982,10 @@ ucd::category ucd::lookup_category(codepoint_t c) | |||
return (ucd::category)table[c % 256]; | |||
} | |||
if (c <= 0x00DFFF) return Cs; // 00D800..00DFFF : Surrogates | |||
if (c <= 0x00F8FF) return Co; // 00E000..00F8FF : Private Use Area | |||
if (c <= 0x02FAFF) // 00F900..02FAFF | |||
if (c <= 0x00F7FF) return Co; // 00E000..00F7FF : Private Use Area | |||
if (c <= 0x02FAFF) // 00F800..02FAFF | |||
{ | |||
const uint8_t *table = categories_00F900_02FAFF[(c - 0x00F900) / 256]; | |||
const uint8_t *table = categories_00F800_02FAFF[(c - 0x00F800) / 256]; | |||
return (ucd::category)table[c % 256]; | |||
} | |||
if (c <= 0x0DFFFF) return Cn; // 02FB00..0DFFFF : Unassigned |
@@ -202,6 +202,7 @@ namespace ucd | |||
Phnx, /**< @brief Phoenician Script */ | |||
Plrd, /**< @brief Miao Script */ | |||
Prti, /**< @brief Inscriptional Parthian Script */ | |||
Qaak, /**< @brief Klingon Script */ | |||
Rjng, /**< @brief Rejang Script */ | |||
Runr, /**< @brief Runic Script */ | |||
Samr, /**< @brief Samaritan Script */ |
@@ -1309,6 +1309,26 @@ static const uint8_t scripts_00D700[256] = | |||
/* F0 */ Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Zzzz, Zzzz, Zzzz, Zzzz, | |||
}; | |||
// Script of each code point in 00F800..00F8FF; the row labels give the low
// byte of the code point. Entries are Zzzz (the value also used for the rest
// of the Private Use Area) except for the supplemental Klingon assignments:
// D0..E9 (letters) and F0..F9 (digits) are Qaak, the Klingon script, and
// FD..FF (comma, full stop, mummification glyph) are Zyyy.
static const uint8_t scripts_00F800[256] = | |||
{ | |||
/* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* D0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, | |||
/* E0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||
/* F0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zyyy, Zyyy, Zyyy, | |||
}; | |||
static const uint8_t scripts_00FA00[256] = | |||
{ | |||
/* 00 */ Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, | |||
@@ -2489,8 +2509,9 @@ static const uint8_t *scripts_000000_00D7FF[] = | |||
scripts_00D700, | |||
}; | |||
static const uint8_t *scripts_00F900_02FAFF[] = | |||
static const uint8_t *scripts_00F800_02FAFF[] = | |||
{ | |||
scripts_00F800, | |||
scripts_Hant, // 00F900 | |||
scripts_00FA00, | |||
scripts_00FB00, | |||
@@ -3020,10 +3041,10 @@ ucd::script ucd::lookup_script(codepoint_t c) | |||
const uint8_t *table = scripts_000000_00D7FF[(c - 0x000000) / 256]; | |||
return (ucd::script)table[c % 256]; | |||
} | |||
if (c <= 0x00F8FF) return Zzzz; // 00D800..00F8FF : Surrogates / Private Use Area | |||
if (c <= 0x02FAFF) // 00F900..02FAFF | |||
if (c <= 0x00F7FF) return Zzzz; // 00D800..00F7FF : Surrogates / Private Use Area | |||
if (c <= 0x02FAFF) // 00F800..02FAFF | |||
{ | |||
const uint8_t *table = scripts_00F900_02FAFF[(c - 0x00F900) / 256]; | |||
const uint8_t *table = scripts_00F800_02FAFF[(c - 0x00F800) / 256]; | |||
return (ucd::script)table[c % 256]; | |||
} | |||
if (c <= 0x0DFFFF) return Zzzz; // 02FB00..0DFFFF : Unassigned |
@@ -0,0 +1,41 @@ | |||
# Code Point ; Script ; General Category ; Name ; Okrand Transliteration | |||
F8D0;Qaak;Lo;KLINGON LETTER A;a | |||
F8D1;Qaak;Lo;KLINGON LETTER B;b | |||
F8D2;Qaak;Lo;KLINGON LETTER CH;ch | |||
F8D3;Qaak;Lo;KLINGON LETTER D;D | |||
F8D4;Qaak;Lo;KLINGON LETTER E;e | |||
F8D5;Qaak;Lo;KLINGON LETTER GH;gh | |||
F8D6;Qaak;Lo;KLINGON LETTER H;H | |||
F8D7;Qaak;Lo;KLINGON LETTER I;I | |||
F8D8;Qaak;Lo;KLINGON LETTER J;j | |||
F8D9;Qaak;Lo;KLINGON LETTER L;l | |||
F8DA;Qaak;Lo;KLINGON LETTER M;m | |||
F8DB;Qaak;Lo;KLINGON LETTER N;n | |||
F8DC;Qaak;Lo;KLINGON LETTER NG;ng | |||
F8DD;Qaak;Lo;KLINGON LETTER O;o | |||
F8DE;Qaak;Lo;KLINGON LETTER P;p | |||
F8DF;Qaak;Lo;KLINGON LETTER Q;q | |||
F8E0;Qaak;Lo;KLINGON LETTER QH;Q | |||
F8E1;Qaak;Lo;KLINGON LETTER R;r | |||
F8E2;Qaak;Lo;KLINGON LETTER S;S | |||
F8E3;Qaak;Lo;KLINGON LETTER T;t | |||
F8E4;Qaak;Lo;KLINGON LETTER TLH;tlh | |||
F8E5;Qaak;Lo;KLINGON LETTER U;u | |||
F8E6;Qaak;Lo;KLINGON LETTER V;v | |||
F8E7;Qaak;Lo;KLINGON LETTER W;w | |||
F8E8;Qaak;Lo;KLINGON LETTER Y;y | |||
F8E9;Qaak;Lo;KLINGON LETTER GLOTTAL STOP;' | |||
F8F0;Qaak;Nd;KLINGON DIGIT ZERO;0 | |||
F8F1;Qaak;Nd;KLINGON DIGIT ONE;1 | |||
F8F2;Qaak;Nd;KLINGON DIGIT TWO;2 | |||
F8F3;Qaak;Nd;KLINGON DIGIT THREE;3 | |||
F8F4;Qaak;Nd;KLINGON DIGIT FOUR;4 | |||
F8F5;Qaak;Nd;KLINGON DIGIT FIVE;5 | |||
F8F6;Qaak;Nd;KLINGON DIGIT SIX;6 | |||
F8F7;Qaak;Nd;KLINGON DIGIT SEVEN;7 | |||
F8F8;Qaak;Nd;KLINGON DIGIT EIGHT;8 | |||
F8F9;Qaak;Nd;KLINGON DIGIT NINE;9 | |||
F8FD;Zyyy;Po;KLINGON COMMA;, | |||
F8FE;Zyyy;Po;KLINGON FULL STOP;. | |||
F8FF;Zyyy;So;KLINGON MUMMIFICATION GLYPH; |
@@ -154,6 +154,7 @@ const char *get_script_string(ucd::script s) | |||
case Phnx: return "Phnx"; | |||
case Plrd: return "Plrd"; | |||
case Prti: return "Prti"; | |||
case Qaak: return "Qaak"; | |||
case Rjng: return "Rjng"; | |||
case Runr: return "Runr"; | |||
case Samr: return "Samr"; |
@@ -28,6 +28,9 @@ unicode_chars = {} | |||
# Record the general category of every code point listed in UnicodeData.
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | |||
for codepoint in data['CodePoint']: | |||
unicode_chars[codepoint] = data['GeneralCategory'] | |||
# Overlay the supplemental Klingon data. This loop runs second, so a code
# point present in both sources ends up with the Klingon.txt category. The
# data directory is the literal 'supplemental', not ucd_rootdir.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
for codepoint in data['CodePoint']: | |||
unicode_chars[codepoint] = data['GeneralCategory'] | |||
# This map is a combination of the information in the UnicodeData and Blocks | |||
# data files. It is intended to reduce the number of character tables that | |||
@@ -35,8 +38,8 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | |||
category_sets = [ | |||
(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'), | |||
(ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'), | |||
(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'), | |||
(ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'), | |||
(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'), |
@@ -35,6 +35,13 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'): | |||
# Attach the script to each code point's record. The record is indexed, not
# created, here -- presumably it was created by an earlier loading step
# (TODO confirm).
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
for codepoint in data['Range']: | |||
unicode_chars[codepoint]['Script'] = data['Script'] | |||
# Supplemental Klingon records replace any existing entry outright. The parsed
# record itself is stored (not a copy) and then extended in place with an
# empty property list and 0000 case mappings, which it does not otherwise get.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
for codepoint in data['CodePoint']: | |||
unicode_chars[codepoint] = data | |||
unicode_chars[codepoint]['Properties'] = [] | |||
unicode_chars[codepoint]['UpperCase'] = ucd.CodePoint('0000') | |||
unicode_chars[codepoint]['LowerCase'] = ucd.CodePoint('0000') | |||
unicode_chars[codepoint]['TitleCase'] = ucd.CodePoint('0000') | |||
null = ucd.CodePoint('0000') | |||
if __name__ == '__main__': |
@@ -28,14 +28,17 @@ unicode_chars = {} | |||
# Record the script of every code point covered by a range in Scripts.
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
for codepoint in data['Range']: | |||
unicode_chars[codepoint] = data['Script'] | |||
# Overlay the supplemental Klingon data. This loop runs second, so a code
# point present in both sources ends up with the Klingon.txt script. The data
# directory is the literal 'supplemental', not ucd_rootdir.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||
for codepoint in data['CodePoint']: | |||
unicode_chars[codepoint] = data['Script'] | |||
# This map is a combination of the information in the UnicodeData and Blocks | |||
# data files. It is intended to reduce the number of character tables that | |||
# need to be generated. | |||
script_sets = [ | |||
(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('00D800..00F8FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||
(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||
(ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'), | |||
(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | |||
(ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'), |
@@ -147,6 +147,14 @@ data_items = { | |||
('LowerCase', codepoint), | |||
('TitleCase', codepoint), | |||
], | |||
# Supplemental Data: | |||
'Klingon': [ | |||
('CodePoint', codepoint), | |||
('Script', str), | |||
('GeneralCategory', string), | |||
('Name', string), | |||
('Transliteration', string), | |||
], | |||
} | |||
def parse_ucd_data(ucd_rootdir, dataset): |