| UCD_VERSION=6.2.0 | UCD_VERSION=6.2.0 | ||||
| UCD_ROOTDIR=data/ucd | UCD_ROOTDIR=data/ucd | ||||
| data/language-subtag-registry: | |||||
| mkdir -pv data | |||||
| wget -O $@ http://www.iana.org/assignments/language-subtag-registry | |||||
| data/ucd/PropList.txt: | data/ucd/PropList.txt: | ||||
| mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
| wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt | wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt | ||||
| data/ucd/Scripts.txt: | |||||
| mkdir -pv data/ucd | |||||
| wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt | |||||
| data/ucd/UnicodeData.txt: | data/ucd/UnicodeData.txt: | ||||
| mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
| wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt | wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt | ||||
| ############################# libucd ########################################## | ############################# libucd ########################################## | ||||
| src/case.cpp: tools/case.py tools/ucd.py data/ucd/UnicodeData.txt | |||||
| src/case.cpp: tools/case.py tools/ucd.py \ | |||||
| data/ucd/UnicodeData.txt | |||||
| tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
| src/categories.cpp: tools/categories.py tools/ucd.py data/ucd/UnicodeData.txt | |||||
| src/categories.cpp: tools/categories.py tools/ucd.py \ | |||||
| data/ucd/UnicodeData.txt | |||||
| tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
| src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||||
| data/language-subtag-registry \ | |||||
| data/ucd/Scripts.txt | |||||
| tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||||
| libucd_includedir = $(includedir)/ucd | libucd_includedir = $(includedir)/ucd | ||||
| libucd_include_HEADERS = \ | libucd_include_HEADERS = \ | ||||
| src/include/ucd/ucd.h | src/include/ucd/ucd.h | ||||
| src_libucd_la_SOURCES = \ | src_libucd_la_SOURCES = \ | ||||
| src/case.cpp \ | src/case.cpp \ | ||||
| src/categories.cpp \ | src/categories.cpp \ | ||||
| src/ctype.cpp | |||||
| src/ctype.cpp \ | |||||
| src/scripts.cpp | |||||
| ############################# tests ########################################### | ############################# tests ########################################### | ||||
| category lookup_category(codepoint_t c); | category lookup_category(codepoint_t c); | ||||
| //@} | |||||
| /** @name Unicode Script | |||||
| * @brief These functions query the Script property of Unicode codepoints. | |||||
| */ | |||||
| //@{ | |||||
/** @brief Unicode Script
  *
  * The enumerator names are the four-letter script codes.
  *
  * @see http://www.iana.org/assignments/language-subtag-registry
  * @see http://www.unicode.org/iso15924/iso15924-codes.html
  */
enum script
{
	Arab, /**< @brief Arabic Script */
	Armi, /**< @brief Imperial Aramaic Script */
	Armn, /**< @brief Armenian Script */
	Avst, /**< @brief Avestan Script */
	Bali, /**< @brief Balinese Script */
	Bamu, /**< @brief Bamum Script */
	Batk, /**< @brief Batak Script */
	Beng, /**< @brief Bengali Script */
	Bopo, /**< @brief Bopomofo Script */
	Brah, /**< @brief Brahmi Script */
	Brai, /**< @brief Braille Script */
	Bugi, /**< @brief Buginese Script */
	Buhd, /**< @brief Buhid Script */
	Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
	Cari, /**< @brief Carian Script */
	Cakm, /**< @brief Chakma Script */
	Cham, /**< @brief Cham Script */
	Cher, /**< @brief Cherokee Script */
	Copt, /**< @brief Coptic Script */
	Cprt, /**< @brief Cypriot Script */
	Cyrl, /**< @brief Cyrillic Script */
	Deva, /**< @brief Devanagari Script */
	Dsrt, /**< @brief Deseret Script */
	Egyp, /**< @brief Egyptian Hieroglyphs */
	Ethi, /**< @brief Ethiopic Script */
	Geor, /**< @brief Georgian Script */
	Glag, /**< @brief Glagolitic Script */
	Goth, /**< @brief Gothic Script */
	Grek, /**< @brief Greek Script */
	Gujr, /**< @brief Gujarati Script */
	Guru, /**< @brief Gurmukhi Script */
	Hang, /**< @brief Hangul Script */
	Hano, /**< @brief Hanunoo Script */
	Hant, /**< @brief Han (Traditional) Script */
	Hebr, /**< @brief Hebrew Script */
	Hira, /**< @brief Hiragana Script */
	Ital, /**< @brief Old Italic Script */
	Java, /**< @brief Javanese Script */
	Kali, /**< @brief Kayah Li Script */
	Kana, /**< @brief Katakana Script */
	Khar, /**< @brief Kharoshthi Script */
	Khmr, /**< @brief Khmer Script */
	Knda, /**< @brief Kannada Script */
	Kthi, /**< @brief Kaithi Script */
	Lana, /**< @brief Tai Tham Script */
	Laoo, /**< @brief Lao Script */
	Latn, /**< @brief Latin Script */
	Lepc, /**< @brief Lepcha Script */
	Limb, /**< @brief Limbu Script */
	Linb, /**< @brief Linear B Script */
	Lisu, /**< @brief Lisu Script */
	Lyci, /**< @brief Lycian Script */
	Lydi, /**< @brief Lydian Script */
	Mand, /**< @brief Mandaic Script */
	Merc, /**< @brief Meroitic Cursive Script */
	Mero, /**< @brief Meroitic Hieroglyphs */
	Mlym, /**< @brief Malayalam Script */
	Mong, /**< @brief Mongolian Script */
	Mtei, /**< @brief Meitei Mayek Script */
	Mymr, /**< @brief Myanmar Script */
	Nkoo, /**< @brief N'Ko Script */
	Ogam, /**< @brief Ogham Script */
	Olck, /**< @brief Ol Chiki Script */
	Orkh, /**< @brief Old Turkic Script */
	Orya, /**< @brief Oriya Script */
	Osma, /**< @brief Osmanya Script */
	Phag, /**< @brief Phags-Pa Script */
	Phli, /**< @brief Inscriptional Pahlavi Script */
	Phnx, /**< @brief Phoenician Script */
	Plrd, /**< @brief Miao Script */
	Prti, /**< @brief Inscriptional Parthian Script */
	Rjng, /**< @brief Rejang Script */
	Runr, /**< @brief Runic Script */
	Samr, /**< @brief Samaritan Script */
	Sarb, /**< @brief Old South Arabian Script */
	Saur, /**< @brief Saurashtra Script */
	Shaw, /**< @brief Shavian Script */
	Shrd, /**< @brief Sharada Script */
	Sinh, /**< @brief Sinhala Script */
	Sora, /**< @brief Sora Sompeng Script */
	Sund, /**< @brief Sundanese Script */
	Sylo, /**< @brief Syloti Nagri Script */
	Syrn, /**< @brief Syriac (Eastern) Script */
	Tagb, /**< @brief Tagbanwa Script */
	Takr, /**< @brief Takri Script */
	Tale, /**< @brief Tai Le Script */
	Talu, /**< @brief New Tai Lue Script */
	Taml, /**< @brief Tamil Script */
	Tavt, /**< @brief Tai Viet Script */
	Telu, /**< @brief Telugu Script */
	Tfng, /**< @brief Tifinagh Script */
	Tglg, /**< @brief Tagalog Script */
	Thaa, /**< @brief Thaana Script */
	Thai, /**< @brief Thai Script */
	Tibt, /**< @brief Tibetan Script */
	Ugar, /**< @brief Ugaritic Script */
	Vaii, /**< @brief Vai Script */
	Xpeo, /**< @brief Old Persian Script */
	Xsux, /**< @brief Cuneiform Script */
	Yiii, /**< @brief Yi Script */
	Zyyy, /**< @brief Common or Inherited Script */
	Zzzz, /**< @brief Unknown Script */
};
/** @brief Look up the Script for a Unicode codepoint.
  *
  * @param c The Unicode codepoint to look up.
  * @return The Script of the Unicode codepoint.
  */
script lookup_script(codepoint_t c);
| //@} | //@} | ||||
| /** @name ctype-style APIs | /** @name ctype-style APIs | ||||
| * @brief These functions provide wctype compatible functions using the UCD data. | * @brief These functions provide wctype compatible functions using the UCD data. |
| } | } | ||||
| } | } | ||||
/** @brief Get the printable four-letter code for a script value.
  *
  * @param s The script value to convert.
  * @return The enumerator's name as a string literal, or "----" when @a s
  *         does not match any of the enumerators handled below.
  */
const char *get_script_string(ucd::script s)
{
	using namespace ucd;
	switch (s)
	{
	case Arab: return "Arab";
	case Armi: return "Armi";
	case Armn: return "Armn";
	case Avst: return "Avst";
	case Bali: return "Bali";
	case Bamu: return "Bamu";
	case Batk: return "Batk";
	case Beng: return "Beng";
	case Bopo: return "Bopo";
	case Brah: return "Brah";
	case Brai: return "Brai";
	case Bugi: return "Bugi";
	case Buhd: return "Buhd";
	case Cans: return "Cans";
	case Cari: return "Cari";
	case Cakm: return "Cakm";
	case Cham: return "Cham";
	case Cher: return "Cher";
	case Copt: return "Copt";
	case Cprt: return "Cprt";
	case Cyrl: return "Cyrl";
	case Deva: return "Deva";
	case Dsrt: return "Dsrt";
	case Egyp: return "Egyp";
	case Ethi: return "Ethi";
	case Geor: return "Geor";
	case Glag: return "Glag";
	case Goth: return "Goth";
	case Grek: return "Grek";
	case Gujr: return "Gujr";
	case Guru: return "Guru";
	case Hang: return "Hang";
	case Hano: return "Hano";
	case Hant: return "Hant";
	case Hebr: return "Hebr";
	case Hira: return "Hira";
	case Ital: return "Ital";
	case Java: return "Java";
	case Kali: return "Kali";
	case Kana: return "Kana";
	case Khar: return "Khar";
	case Khmr: return "Khmr";
	case Knda: return "Knda";
	case Kthi: return "Kthi";
	case Lana: return "Lana";
	case Laoo: return "Laoo";
	case Latn: return "Latn";
	case Lepc: return "Lepc";
	case Limb: return "Limb";
	case Linb: return "Linb";
	case Lisu: return "Lisu";
	case Lyci: return "Lyci";
	case Lydi: return "Lydi";
	case Mand: return "Mand";
	case Merc: return "Merc";
	case Mero: return "Mero";
	case Mlym: return "Mlym";
	case Mong: return "Mong";
	case Mtei: return "Mtei";
	case Mymr: return "Mymr";
	case Nkoo: return "Nkoo";
	case Ogam: return "Ogam";
	case Olck: return "Olck";
	case Orkh: return "Orkh";
	case Orya: return "Orya";
	case Osma: return "Osma";
	case Phag: return "Phag";
	case Phli: return "Phli";
	case Phnx: return "Phnx";
	case Plrd: return "Plrd";
	case Prti: return "Prti";
	case Rjng: return "Rjng";
	case Runr: return "Runr";
	case Samr: return "Samr";
	case Sarb: return "Sarb";
	case Saur: return "Saur";
	case Shaw: return "Shaw";
	case Shrd: return "Shrd";
	case Sinh: return "Sinh";
	case Sora: return "Sora";
	case Sund: return "Sund";
	case Sylo: return "Sylo";
	case Syrn: return "Syrn";
	case Tagb: return "Tagb";
	case Takr: return "Takr";
	case Tale: return "Tale";
	case Talu: return "Talu";
	case Taml: return "Taml";
	case Tavt: return "Tavt";
	case Telu: return "Telu";
	case Tfng: return "Tfng";
	case Tglg: return "Tglg";
	case Thaa: return "Thaa";
	case Thai: return "Thai";
	case Tibt: return "Tibt";
	case Ugar: return "Ugar";
	case Vaii: return "Vaii";
	case Xpeo: return "Xpeo";
	case Xsux: return "Xsux";
	case Yiii: return "Yiii";
	case Zyyy: return "Zyyy";
	case Zzzz: return "Zzzz";
	// Fallback for any value without an explicit case above.
	default: return "----";
	}
}
| int main() | int main() | ||||
| { | { | ||||
| for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | ||||
| { | { | ||||
| const char *script = get_script_string(ucd::lookup_script(c)); | |||||
| const char *category = get_category_string(ucd::lookup_category(c)); | const char *category = get_category_string(ucd::lookup_category(c)); | ||||
| const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | ||||
| ucd::codepoint_t upper = ucd::toupper(c); | ucd::codepoint_t upper = ucd::toupper(c); | ||||
| ucd::codepoint_t lower = ucd::tolower(c); | ucd::codepoint_t lower = ucd::tolower(c); | ||||
| ucd::codepoint_t title = ucd::totitle(c); | ucd::codepoint_t title = ucd::totitle(c); | ||||
| const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | ||||
| printf("%06X %s %s %06X %06X %06X %s\n", | |||||
| c, category_group, category, | |||||
| printf("%06X %s %s %s %06X %06X %06X %s\n", | |||||
| c, script, category_group, category, | |||||
| upper, lower, title, | upper, lower, title, | ||||
| whitespace); | whitespace); | ||||
| } | } |
| #!/usr/bin/python | |||||
| # Copyright (C) 2012 Reece H. Dunn | |||||
| # | |||||
| # This file is part of ucd-tools. | |||||
| # | |||||
| # ucd-tools is free software: you can redistribute it and/or modify | |||||
| # it under the terms of the GNU General Public License as published by | |||||
| # the Free Software Foundation, either version 3 of the License, or | |||||
| # (at your option) any later version. | |||||
| # | |||||
| # ucd-tools is distributed in the hope that it will be useful, | |||||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| # GNU General Public License for more details. | |||||
| # | |||||
| # You should have received a copy of the GNU General Public License | |||||
| # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
| import os | |||||
def read_data(path, split_char=':'):
    """Yield the fields of every non-comment line in a delimited text file.

    Each line has its newline removed and is split on ``split_char``.
    Lines beginning with ``#`` are skipped; blank lines are not, so they
    come back as ``['']``.
    """
    with open(path) as source:
        for raw in source:
            text = raw.replace('\n', '')
            if text.startswith('#'):
                continue
            yield text.split(split_char)
def fold_lines(path):
    """Yield the logical lines of a file that uses folded continuations.

    A physical line starting with a space continues the previous line:
    its first character is dropped and the remainder is appended to the
    line being built.  Empty logical lines are not yielded.

    The logical line still being built when the file ends is yielded as
    well; without this the last field of the file would be lost.  A
    continuation line with nothing before it starts a new logical line
    instead of being glued onto a placeholder value.
    """
    pending = None
    with open(path) as f:
        for line in f:
            line = line.replace('\n', '')
            if line.startswith(' '):
                if pending is None:
                    # Nothing to continue yet, so begin a line here.
                    pending = line[1:]
                else:
                    pending = '%s%s' % (pending, line[1:])
                continue
            if pending:
                yield pending
            pending = line
    # Flush the final logical line; the loop only emits a line when the
    # next unfolded line arrives.
    if pending:
        yield pending
def iana_subtag_entries(path):
    """Yield one dict of ``key: value`` fields per record in a registry file.

    Records are separated by ``%%`` lines and each logical line (see
    ``fold_lines``) is split at the first ``': '``.  Only records that
    contain a ``Type`` field are yielded.  That rule is applied to the
    last record too, so an empty file or one ending in ``%%`` no longer
    produces a bogus trailing record.
    """
    tag = {}
    for line in fold_lines(path):
        if line == '%%':
            if 'Type' in tag:
                yield tag
            tag = {}
            continue

        key, _, value = line.partition(': ')
        # Only keep the first Description. This handles subtag codes that
        # have multiple descriptions (e.g. 'es' maps to "Spanish" and
        # "Castilian").
        if key == 'Description' and key in tag:
            continue
        tag[key] = value

    # The file's last record has no '%%' after it, so emit it here.
    if 'Type' in tag:
        yield tag
# Maps the registry's 'Type' field values to the names stored in the
# 'Type' key of the records returned by read_iana_subtags.
typemap = {
    'extlang': 'ExtLang',
    'grandfathered': 'Grandfathered',
    'language': 'Language',
    'redundant': 'Redundant',
    'region': 'Region',
    'script': 'Script',
    'variant': 'Variant',
}

# Maps the registry's 'Scope' field values to type names. In
# read_iana_subtags a record with a 'Scope' gets its type from this map
# instead of from typemap.
scopemap = {
    'collection': 'Collection',
    'macrolanguage': 'MacroLanguage',
    'special': 'Special',
    'private-use': 'PrivateUse',
}
def read_iana_subtags(path):
    """Load a subtag registry file into a dict keyed by subtag (or tag).

    Each value is the record's remaining fields, with 'Type' rewritten
    through ``scopemap`` when the record has a 'Scope' and through
    ``typemap`` otherwise; the 'Scope' field itself is removed.

    Raises Exception when a 'Scope' appears on a record whose type is
    not 'language', and KeyError for a type or scope missing from the
    maps.
    """
    tags = {}
    for entry in iana_subtag_entries(path):
        # Records are identified by 'Subtag' when present, else by 'Tag'.
        ref_key = 'Subtag' if 'Subtag' in entry else 'Tag'
        ref = entry[ref_key]
        del entry[ref_key]

        if 'Scope' not in entry:
            entry['Type'] = typemap[entry['Type']]
        else:
            if entry['Type'] != 'language':
                raise Exception('"Scope" property unexpected for Type="%s"' % entry['Type'])
            entry['Type'] = scopemap[entry['Scope']]
            del entry['Scope']

        if '..' in ref:
            # exclude private use definitions
            continue
        tags[ref] = entry
    return tags
| if data['Property'] in ['White_Space']: | if data['Property'] in ['White_Space']: | ||||
| for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
| unicode_chars[codepoint]['Properties'].append(data['Property']) | unicode_chars[codepoint]['Properties'].append(data['Property']) | ||||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||||
| for codepoint in data['Range']: | |||||
| unicode_chars[codepoint]['Script'] = data['Script'] | |||||
| null = ucd.CodePoint('0000') | null = ucd.CodePoint('0000') | ||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| for codepoint in ucd.CodeRange('000000..10FFFF'): | for codepoint in ucd.CodeRange('000000..10FFFF'): | ||||
| try: | try: | ||||
| data = unicode_chars[codepoint] | data = unicode_chars[codepoint] | ||||
| title = data['TitleCase'] | |||||
| upper = data['UpperCase'] | |||||
| lower = data['LowerCase'] | |||||
| if title == null: title = codepoint | |||||
| if upper == null: upper = codepoint | |||||
| if lower == null: lower = codepoint | |||||
| print '%s %s %s %s %s %s %s' % ( | |||||
| codepoint, data['GeneralCategory'][0], data['GeneralCategory'], | |||||
| upper, lower, title, | |||||
| ' '.join(data['Properties'])) | |||||
| except KeyError: | except KeyError: | ||||
| print '%s C Cn %s %s %s ' % (codepoint, codepoint, codepoint, codepoint) | |||||
| data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []} | |||||
| try: | |||||
| script = data['Script'] | |||||
| except KeyError: | |||||
| script = 'Zzzz' | |||||
| title = data['TitleCase'] | |||||
| upper = data['UpperCase'] | |||||
| lower = data['LowerCase'] | |||||
| if title == null: title = codepoint | |||||
| if upper == null: upper = codepoint | |||||
| if lower == null: lower = codepoint | |||||
| print '%s %s %s %s %s %s %s %s' % ( | |||||
| codepoint, script, | |||||
| data['GeneralCategory'][0], data['GeneralCategory'], | |||||
| upper, lower, title, | |||||
| ' '.join(data['Properties'])) |
| #!/usr/bin/python | |||||
| # Copyright (C) 2012 Reece H. Dunn | |||||
| # | |||||
| # This file is part of ucd-tools. | |||||
| # | |||||
| # ucd-tools is free software: you can redistribute it and/or modify | |||||
| # it under the terms of the GNU General Public License as published by | |||||
| # the Free Software Foundation, either version 3 of the License, or | |||||
| # (at your option) any later version. | |||||
| # | |||||
| # ucd-tools is distributed in the hope that it will be useful, | |||||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| # GNU General Public License for more details. | |||||
| # | |||||
| # You should have received a copy of the GNU General Public License | |||||
| # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
| import os | |||||
| import sys | |||||
| import ucd | |||||
# Command line: <ucd data directory> <ucd version>.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps each codepoint listed in the Scripts data to its script value.
# Codepoints that are not listed are absent from the dict.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        unicode_chars[codepoint] = data['Script']
# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (codepoint range, script, comment). A script of None means
# the range holds a mix of scripts and needs per-page lookup tables; any
# other value is returned directly for every codepoint in the range.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
    (ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
    (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
    (ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
]
# These scripts have many pages consisting of just this script:
special_scripts = []

# Maps '<first>_<last>' of each mixed range in script_sets to its page
# table: {first codepoint of a 256-entry page: page}. A page is either a
# list with one script per codepoint, or a single script name when every
# codepoint in the page has that script and a shared table is used.
script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None      # scripts collected for the current page
        table_codepoint = None  # first codepoint of the current page
        table_script = None     # the page's script while uniform, else None
        for i, codepoint in enumerate(codepoints):
            try:
                script = unicode_chars[codepoint]
            except KeyError:
                script = 'Zzzz' # Unknown
            if (i % 256) == 0:
                # Page boundary: store the page just completed (if any),
                # then start collecting a new one.
                if table_entry:
                    if table_script in special_scripts:
                        table[table_codepoint] = table_script
                    elif table_script:
                        # First uniform page seen for this script.
                        special_scripts.append(table_script)
                        table[table_codepoint] = table_script
                    else:
                        table[table_codepoint] = table_entry
                table_entry = []
                table_codepoint = codepoint
                table_script = script
            if script != table_script:
                # The page holds more than one script.
                table_script = None
            table_entry.append(script)
        # Store the last page of the range. Unlike the in-loop case, a
        # uniform page is only shared here when its script is already in
        # special_scripts; otherwise the full list is kept.
        if table_entry:
            if table_script in special_scripts:
                table[table_codepoint] = table_script
            else:
                table[table_codepoint] = table_entry
        script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
# Writes the generated C++ source for ucd::lookup_script to stdout.
if __name__ == '__main__':
    # File header; the only substitution is the UCD version.
    sys.stdout.write("""/* Unicode Scripts
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
 */
// NOTE: This file is automatically generated from the Scripts.txt file in
// the Unicode Character database by the ucd-tools/tools/scripts.py script.
#include "ucd/ucd.h"
#include <stddef.h>
using namespace ucd;
// Unicode Character Data %s
""" % ucd_version)

    # One shared 256-entry table per script that fills whole pages.
    for script in special_scripts:
        sys.stdout.write('\n')
        sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
        sys.stdout.write('{')
        for i in range(0, 256):
            if (i % 16) == 0:
                sys.stdout.write('\n\t/* %02X */' % i)
            sys.stdout.write(' %s,' % script)
        sys.stdout.write('\n};\n')

    # One 256-entry table per mixed page, named after the page's first
    # codepoint. Pages stored as a script name use the shared tables
    # written above and are skipped here.
    for codepoints, script, comment in script_sets:
        if not script:
            tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
            for codepoint in sorted(tables.keys()):
                table = tables[codepoint]
                if table in special_scripts:
                    continue
                sys.stdout.write('\n')
                sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
                sys.stdout.write('{')
                for i, script in enumerate(table):
                    if (i % 16) == 0:
                        sys.stdout.write('\n\t/* %02X */' % i)
                    sys.stdout.write(' %s,' % script)
                sys.stdout.write('\n};\n')

    # One array of page-table pointers per mixed range.
    for codepoints, script, comment in script_sets:
        if not script:
            table_index = '%s_%s' % (codepoints.first, codepoints.last)
            sys.stdout.write('\n')
            sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
            sys.stdout.write('{\n')
            for codepoint, table in sorted(script_tables[table_index].items()):
                if isinstance(table, str):
                    # Shared table; the page's codepoint goes in a comment.
                    sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
                else:
                    sys.stdout.write('\tscripts_%s,\n' % codepoint)
            sys.stdout.write('};\n')

    # The lookup function tests the ranges in script_sets order, so each
    # 'c <= last' check relies on the earlier ranges having been ruled out.
    sys.stdout.write('\n')
    sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
    sys.stdout.write('{\n')
    for codepoints, script, comment in script_sets:
        if script:
            sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
        else:
            sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
            sys.stdout.write('\t{\n')
            sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
            sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
            sys.stdout.write('\t}\n')
    sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
    sys.stdout.write('}\n')
| import os | import os | ||||
| import sys | import sys | ||||
| import iana | |||||
# Maps the script names used in the UCD data to four-letter script codes.
# Only the names that need an explicit entry are listed here.
script_map = {
    # UCD script names not derivable from IANA script tags:
    'Canadian_Aboriginal': 'Cans',
    'Common': 'Zyyy',
    'Egyptian_Hieroglyphs': 'Egyp',
    'Inherited': 'Zyyy',
    'Meetei_Mayek': 'Mtei',
    'Nko': 'Nkoo',
    'Phags_Pa': 'Phag',
    # Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
    'Cuneiform': 'Xsux',
}
# Add an entry to script_map for every script subtag in the IANA registry.
for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
    if tag['Type'] != 'Script':
        continue
    # Convert the IANA script tag description to the UCD script name:
    # drop any parenthesised qualifier, then join the words with '_'.
    desc = tag['Description']
    desc = desc.split(' (')[0]
    desc = desc.replace(' ', '_')
    script_map[desc] = ref

# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'
| class CodePoint: | class CodePoint: | ||||
| def __init__(self, x): | def __init__(self, x): | ||||
| return True | return True | ||||
| return False | return False | ||||
def script(x):
    """Return the script_map entry for the script name ``x``.

    Raises KeyError when ``x`` has no entry in ``script_map``.
    """
    return script_map[x]
| data_items = { | data_items = { | ||||
| 'Blocks': [ | 'Blocks': [ | ||||
| ('Range', codepoint), | ('Range', codepoint), | ||||
| ], | ], | ||||
| 'Scripts': [ | 'Scripts': [ | ||||
| ('Range', codepoint), | ('Range', codepoint), | ||||
| ('Script', str), | |||||
| ('Script', script), | |||||
| ], | ], | ||||
| 'UnicodeData': [ | 'UnicodeData': [ | ||||
| ('CodePoint', codepoint), | ('CodePoint', codepoint), |