############################# libucd ########################################## | ############################# libucd ########################################## | ||||
src/case.cpp: tools/case.py tools/ucd.py \ | src/case.cpp: tools/case.py tools/ucd.py \ | ||||
supplemental/Klingon.txt \ | |||||
data/ucd/UnicodeData.txt | data/ucd/UnicodeData.txt | ||||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
src/categories.cpp: tools/categories.py tools/ucd.py \ | src/categories.cpp: tools/categories.py tools/ucd.py \ | ||||
supplemental/Klingon.txt \ | |||||
data/ucd/UnicodeData.txt | data/ucd/UnicodeData.txt | ||||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
src/scripts.cpp: tools/scripts.py tools/ucd.py \ | src/scripts.cpp: tools/scripts.py tools/ucd.py \ | ||||
supplemental/Klingon.txt \ | |||||
data/language-subtag-registry \ | data/language-subtag-registry \ | ||||
data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
tests_printucddata_LDADD = src/libucd.la | tests_printucddata_LDADD = src/libucd.la | ||||
tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | tests/unicode-data.expected: tools/printdata.py tools/ucd.py \ | ||||
supplemental/Klingon.txt \ | |||||
data/ucd/UnicodeData.txt \ | data/ucd/UnicodeData.txt \ | ||||
data/ucd/PropList.txt | |||||
data/ucd/PropList.txt \ | |||||
data/ucd/Scripts.txt | |||||
tools/printdata.py ${UCD_ROOTDIR} > $@ | tools/printdata.py ${UCD_ROOTDIR} > $@ | ||||
tests/unicode-data.actual: tests/printucddata | tests/unicode-data.actual: tests/printucddata |
/* F0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Cn, Cn, Cn, Cn, | /* F0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Cn, Cn, Cn, Cn, | ||||
}; | }; | ||||
// General category for each code point in the page U+F800..U+F8FF, indexed by
// the low byte of the code point (c % 256); the row comments give the low byte
// of the first entry in each row. Every entry is Co except D0..E9 (Lo),
// F0..F9 (Nd), FD..FE (Po) and FF (So), which are the code points listed in
// supplemental/Klingon.txt.
static const uint8_t categories_00F800[256] = | |||||
{ | |||||
/* 00 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 10 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 20 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 30 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 40 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 50 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 60 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 70 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 80 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* 90 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* A0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* B0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* C0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, | |||||
/* D0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | |||||
/* E0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Co, Co, Co, Co, Co, Co, | |||||
/* F0 */ Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Co, Co, Co, Po, Po, So, | |||||
}; | |||||
static const uint8_t categories_00FA00[256] = | static const uint8_t categories_00FA00[256] = | ||||
{ | { | ||||
/* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | /* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, | ||||
categories_00D700, | categories_00D700, | ||||
}; | }; | ||||
static const uint8_t *categories_00F900_02FAFF[] = | |||||
static const uint8_t *categories_00F800_02FAFF[] = | |||||
{ | { | ||||
categories_00F800, | |||||
categories_Lo, // 00F900 | categories_Lo, // 00F900 | ||||
categories_00FA00, | categories_00FA00, | ||||
categories_00FB00, | categories_00FB00, | ||||
return (ucd::category)table[c % 256]; | return (ucd::category)table[c % 256]; | ||||
} | } | ||||
if (c <= 0x00DFFF) return Cs; // 00D800..00DFFF : Surrogates | if (c <= 0x00DFFF) return Cs; // 00D800..00DFFF : Surrogates | ||||
if (c <= 0x00F8FF) return Co; // 00E000..00F8FF : Private Use Area | |||||
if (c <= 0x02FAFF) // 00F900..02FAFF | |||||
if (c <= 0x00F7FF) return Co; // 00E000..00F7FF : Private Use Area | |||||
if (c <= 0x02FAFF) // 00F800..02FAFF | |||||
{ | { | ||||
const uint8_t *table = categories_00F900_02FAFF[(c - 0x00F900) / 256]; | |||||
const uint8_t *table = categories_00F800_02FAFF[(c - 0x00F800) / 256]; | |||||
return (ucd::category)table[c % 256]; | return (ucd::category)table[c % 256]; | ||||
} | } | ||||
if (c <= 0x0DFFFF) return Cn; // 02FB00..0DFFFF : Unassigned | if (c <= 0x0DFFFF) return Cn; // 02FB00..0DFFFF : Unassigned |
Phnx, /**< @brief Phoenician Script */ | Phnx, /**< @brief Phoenician Script */ | ||||
Plrd, /**< @brief Miao Script */ | Plrd, /**< @brief Miao Script */ | ||||
Prti, /**< @brief Inscriptional Parthian Script */ | Prti, /**< @brief Inscriptional Parthian Script */ | ||||
Qaak, /**< @brief Klingon Script */ | |||||
Rjng, /**< @brief Rejang Script */ | Rjng, /**< @brief Rejang Script */ | ||||
Runr, /**< @brief Runic Script */ | Runr, /**< @brief Runic Script */ | ||||
Samr, /**< @brief Samaritan Script */ | Samr, /**< @brief Samaritan Script */ |
/* F0 */ Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Zzzz, Zzzz, Zzzz, Zzzz, | /* F0 */ Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Zzzz, Zzzz, Zzzz, Zzzz, | ||||
}; | }; | ||||
// Script code for each code point in the page U+F800..U+F8FF, indexed by the
// low byte of the code point (c % 256); the row comments give the low byte of
// the first entry in each row. Every entry is Zzzz except D0..E9 and F0..F9
// (Qaak) and FD..FF (Zyyy), which are the code points listed in
// supplemental/Klingon.txt.
static const uint8_t scripts_00F800[256] = | |||||
{ | |||||
/* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* D0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, | |||||
/* E0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, | |||||
/* F0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zyyy, Zyyy, Zyyy, | |||||
}; | |||||
static const uint8_t scripts_00FA00[256] = | static const uint8_t scripts_00FA00[256] = | ||||
{ | { | ||||
/* 00 */ Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, | /* 00 */ Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, | ||||
scripts_00D700, | scripts_00D700, | ||||
}; | }; | ||||
static const uint8_t *scripts_00F900_02FAFF[] = | |||||
static const uint8_t *scripts_00F800_02FAFF[] = | |||||
{ | { | ||||
scripts_00F800, | |||||
scripts_Hant, // 00F900 | scripts_Hant, // 00F900 | ||||
scripts_00FA00, | scripts_00FA00, | ||||
scripts_00FB00, | scripts_00FB00, | ||||
const uint8_t *table = scripts_000000_00D7FF[(c - 0x000000) / 256]; | const uint8_t *table = scripts_000000_00D7FF[(c - 0x000000) / 256]; | ||||
return (ucd::script)table[c % 256]; | return (ucd::script)table[c % 256]; | ||||
} | } | ||||
if (c <= 0x00F8FF) return Zzzz; // 00D800..00F8FF : Surrogates / Private Use Area | |||||
if (c <= 0x02FAFF) // 00F900..02FAFF | |||||
if (c <= 0x00F7FF) return Zzzz; // 00D800..00F7FF : Surrogates / Private Use Area | |||||
if (c <= 0x02FAFF) // 00F800..02FAFF | |||||
{ | { | ||||
const uint8_t *table = scripts_00F900_02FAFF[(c - 0x00F900) / 256]; | |||||
const uint8_t *table = scripts_00F800_02FAFF[(c - 0x00F800) / 256]; | |||||
return (ucd::script)table[c % 256]; | return (ucd::script)table[c % 256]; | ||||
} | } | ||||
if (c <= 0x0DFFFF) return Zzzz; // 02FB00..0DFFFF : Unassigned | if (c <= 0x0DFFFF) return Zzzz; // 02FB00..0DFFFF : Unassigned |
# Code Point ; Script ; General Category ; Name ; Okrand Transliteration | |||||
F8D0;Qaak;Lo;KLINGON LETTER A;a | |||||
F8D1;Qaak;Lo;KLINGON LETTER B;b | |||||
F8D2;Qaak;Lo;KLINGON LETTER CH;ch | |||||
F8D3;Qaak;Lo;KLINGON LETTER D;D | |||||
F8D4;Qaak;Lo;KLINGON LETTER E;e | |||||
F8D5;Qaak;Lo;KLINGON LETTER GH;gh | |||||
F8D6;Qaak;Lo;KLINGON LETTER H;H | |||||
F8D7;Qaak;Lo;KLINGON LETTER I;I | |||||
F8D8;Qaak;Lo;KLINGON LETTER J;j | |||||
F8D9;Qaak;Lo;KLINGON LETTER L;l | |||||
F8DA;Qaak;Lo;KLINGON LETTER M;m | |||||
F8DB;Qaak;Lo;KLINGON LETTER N;n | |||||
F8DC;Qaak;Lo;KLINGON LETTER NG;ng | |||||
F8DD;Qaak;Lo;KLINGON LETTER O;o | |||||
F8DE;Qaak;Lo;KLINGON LETTER P;p | |||||
F8DF;Qaak;Lo;KLINGON LETTER Q;q | |||||
F8E0;Qaak;Lo;KLINGON LETTER QH;Q | |||||
F8E1;Qaak;Lo;KLINGON LETTER R;r | |||||
F8E2;Qaak;Lo;KLINGON LETTER S;S | |||||
F8E3;Qaak;Lo;KLINGON LETTER T;t | |||||
F8E4;Qaak;Lo;KLINGON LETTER TLH;tlh | |||||
F8E5;Qaak;Lo;KLINGON LETTER U;u | |||||
F8E6;Qaak;Lo;KLINGON LETTER V;v | |||||
F8E7;Qaak;Lo;KLINGON LETTER W;w | |||||
F8E8;Qaak;Lo;KLINGON LETTER Y;y | |||||
F8E9;Qaak;Lo;KLINGON LETTER GLOTTAL STOP;' | |||||
F8F0;Qaak;Nd;KLINGON DIGIT ZERO;0 | |||||
F8F1;Qaak;Nd;KLINGON DIGIT ONE;1 | |||||
F8F2;Qaak;Nd;KLINGON DIGIT TWO;2 | |||||
F8F3;Qaak;Nd;KLINGON DIGIT THREE;3 | |||||
F8F4;Qaak;Nd;KLINGON DIGIT FOUR;4 | |||||
F8F5;Qaak;Nd;KLINGON DIGIT FIVE;5 | |||||
F8F6;Qaak;Nd;KLINGON DIGIT SIX;6 | |||||
F8F7;Qaak;Nd;KLINGON DIGIT SEVEN;7 | |||||
F8F8;Qaak;Nd;KLINGON DIGIT EIGHT;8 | |||||
F8F9;Qaak;Nd;KLINGON DIGIT NINE;9 | |||||
F8FD;Zyyy;Po;KLINGON COMMA;, | |||||
F8FE;Zyyy;Po;KLINGON FULL STOP;. | |||||
F8FF;Zyyy;So;KLINGON MUMMIFICATION GLYPH; |
case Phnx: return "Phnx"; | case Phnx: return "Phnx"; | ||||
case Plrd: return "Plrd"; | case Plrd: return "Plrd"; | ||||
case Prti: return "Prti"; | case Prti: return "Prti"; | ||||
case Qaak: return "Qaak"; | |||||
case Rjng: return "Rjng"; | case Rjng: return "Rjng"; | ||||
case Runr: return "Runr"; | case Runr: return "Runr"; | ||||
case Samr: return "Samr"; | case Samr: return "Samr"; |
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
for codepoint in data['CodePoint']: | for codepoint in data['CodePoint']: | ||||
unicode_chars[codepoint] = data['GeneralCategory'] | unicode_chars[codepoint] = data['GeneralCategory'] | ||||
# Overlay the supplemental Klingon records: every code point they list takes
# the general category given there, replacing any value already stored in
# unicode_chars for that code point.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data['GeneralCategory'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
# data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that | ||||
category_sets = [ | category_sets = [ | ||||
(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | ||||
(ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'), | (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'), | ||||
(ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'), | |||||
(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'), | |||||
(ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'), | (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'), | ||||
(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | ||||
(ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'), | (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'), |
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint]['Script'] = data['Script'] | unicode_chars[codepoint]['Script'] = data['Script'] | ||||
# Install the supplemental Klingon records into unicode_chars. Each record
# replaces whatever entry its code point had, and is then given an empty
# Properties list and code point 0000 for its upper, lower and title case
# mappings, since the supplemental record does not supply those fields.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data | |||||
unicode_chars[codepoint]['Properties'] = [] | |||||
unicode_chars[codepoint]['UpperCase'] = ucd.CodePoint('0000') | |||||
unicode_chars[codepoint]['LowerCase'] = ucd.CodePoint('0000') | |||||
unicode_chars[codepoint]['TitleCase'] = ucd.CodePoint('0000') | |||||
null = ucd.CodePoint('0000') | null = ucd.CodePoint('0000') | ||||
if __name__ == '__main__': | if __name__ == '__main__': |
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint] = data['Script'] | unicode_chars[codepoint] = data['Script'] | ||||
# Overlay the supplemental Klingon records: every code point they list takes
# the script code given there, replacing any value already stored in
# unicode_chars for that code point.
for data in ucd.parse_ucd_data('supplemental', 'Klingon'): | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data['Script'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
# data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that | ||||
# need to be generated. | # need to be generated. | ||||
script_sets = [ | script_sets = [ | ||||
(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | ||||
(ucd.CodeRange('00D800..00F8FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||||
(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'), | |||||
(ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'), | (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'), | ||||
(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | ||||
(ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'), | (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'), |
('LowerCase', codepoint), | ('LowerCase', codepoint), | ||||
('TitleCase', codepoint), | ('TitleCase', codepoint), | ||||
], | ], | ||||
# Supplemental Data: | |||||
# Field names and converters for the 'Klingon' dataset, in the same order as
# the columns named in the data file's header line:
#   Code Point ; Script ; General Category ; Name ; Okrand Transliteration
# NOTE(review): 'Script' is converted with str while the other text fields use
# string -- confirm the difference is intentional.
'Klingon': [ | |||||
('CodePoint', codepoint), | |||||
('Script', str), | |||||
('GeneralCategory', string), | |||||
('Name', string), | |||||
('Transliteration', string), | |||||
], | |||||
} | } | ||||
def parse_ucd_data(ucd_rootdir, dataset): | def parse_ucd_data(ucd_rootdir, dataset): |