| @@ -57,22 +57,37 @@ EXTRA_DIST += ChangeLog | |||
| UCD_VERSION=6.2.0 | |||
| UCD_ROOTDIR=data/ucd | |||
# Reference data downloads.
#
# These targets have no prerequisites, so make considers them up to date as
# soon as the file exists.  `wget -O FILE` creates FILE before the transfer
# starts, which means a failed or interrupted download would leave an empty
# or partial target behind that is never fetched again.  Each rule therefore
# downloads to a temporary name and renames it onto the target only after
# wget has succeeded.
data/language-subtag-registry:
	mkdir -pv data
	wget -O $@.tmp http://www.iana.org/assignments/language-subtag-registry
	mv $@.tmp $@
data/ucd/PropList.txt:
	mkdir -pv data/ucd
	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt
	mv $@.tmp $@
data/ucd/Scripts.txt:
	mkdir -pv data/ucd
	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt
	mv $@.tmp $@
data/ucd/UnicodeData.txt:
	mkdir -pv data/ucd
	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt
	mv $@.tmp $@
| ############################# libucd ########################################## | |||
| src/case.cpp: tools/case.py tools/ucd.py data/ucd/UnicodeData.txt | |||
| src/case.cpp: tools/case.py tools/ucd.py \ | |||
| data/ucd/UnicodeData.txt | |||
| tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| src/categories.cpp: tools/categories.py tools/ucd.py data/ucd/UnicodeData.txt | |||
| src/categories.cpp: tools/categories.py tools/ucd.py \ | |||
| data/ucd/UnicodeData.txt | |||
| tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||
| data/language-subtag-registry \ | |||
| data/ucd/Scripts.txt | |||
| tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
| libucd_includedir = $(includedir)/ucd | |||
| libucd_include_HEADERS = \ | |||
| src/include/ucd/ucd.h | |||
| @@ -83,7 +98,8 @@ src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS} | |||
| src_libucd_la_SOURCES = \ | |||
| src/case.cpp \ | |||
| src/categories.cpp \ | |||
| src/ctype.cpp | |||
| src/ctype.cpp \ | |||
| src/scripts.cpp | |||
| ############################# tests ########################################### | |||
| @@ -118,6 +118,131 @@ namespace ucd | |||
| category lookup_category(codepoint_t c); | |||
| //@} | |||
| /** @name Unicode Script | |||
| * @brief These functions query the Script property of Unicode codepoints. | |||
| */ | |||
| //@{ | |||
/** @brief Unicode Script
  *
  * The enumerators are named after the four-letter script codes used by the
  * references below. No explicit values are assigned, so the numeric value of
  * each enumerator is its position in this list.
  *
  * @see http://www.iana.org/assignments/language-subtag-registry
  * @see http://www.unicode.org/iso15924/iso15924-codes.html
  */
enum script
{
	Arab, /**< @brief Arabic Script */
	Armi, /**< @brief Imperial Aramaic Script */
	Armn, /**< @brief Armenian Script */
	Avst, /**< @brief Avestan Script */
	Bali, /**< @brief Balinese Script */
	Bamu, /**< @brief Bamum Script */
	Batk, /**< @brief Batak Script */
	Beng, /**< @brief Bengali Script */
	Bopo, /**< @brief Bopomofo Script */
	Brah, /**< @brief Brahmi Script */
	Brai, /**< @brief Braille Script */
	Bugi, /**< @brief Buginese Script */
	Buhd, /**< @brief Buhid Script */
	Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
	Cari, /**< @brief Carian Script */
	Cakm, /**< @brief Chakma Script */
	Cham, /**< @brief Cham Script */
	Cher, /**< @brief Cherokee Script */
	Copt, /**< @brief Coptic Script */
	Cprt, /**< @brief Cypriot Script */
	Cyrl, /**< @brief Cyrillic Script */
	Deva, /**< @brief Devanagari Script */
	Dsrt, /**< @brief Deseret Script */
	Egyp, /**< @brief Egyptian Hieroglyphs */
	Ethi, /**< @brief Ethiopic Script */
	Geor, /**< @brief Georgian Script */
	Glag, /**< @brief Glagolitic Script */
	Goth, /**< @brief Gothic Script */
	Grek, /**< @brief Greek Script */
	Gujr, /**< @brief Gujarati Script */
	Guru, /**< @brief Gurmukhi Script */
	Hang, /**< @brief Hangul Script */
	Hano, /**< @brief Hanunoo Script */
	Hant, /**< @brief Han (Traditional) Script */
	Hebr, /**< @brief Hebrew Script */
	Hira, /**< @brief Hiragana Script */
	Ital, /**< @brief Old Italic Script */
	Java, /**< @brief Javanese Script */
	Kali, /**< @brief Kayah Li Script */
	Kana, /**< @brief Katakana Script */
	Khar, /**< @brief Kharoshthi Script */
	Khmr, /**< @brief Khmer Script */
	Knda, /**< @brief Kannada Script */
	Kthi, /**< @brief Kaithi Script */
	Lana, /**< @brief Tai Tham Script */
	Laoo, /**< @brief Lao Script */
	Latn, /**< @brief Latin Script */
	Lepc, /**< @brief Lepcha Script */
	Limb, /**< @brief Limbu Script */
	Linb, /**< @brief Linear B Script */
	Lisu, /**< @brief Lisu Script */
	Lyci, /**< @brief Lycian Script */
	Lydi, /**< @brief Lydian Script */
	Mand, /**< @brief Mandaic Script */
	Merc, /**< @brief Meroitic Cursive Script */
	Mero, /**< @brief Meroitic Hieroglyphs */
	Mlym, /**< @brief Malayalam Script */
	Mong, /**< @brief Mongolian Script */
	Mtei, /**< @brief Meitei Mayek Script */
	Mymr, /**< @brief Myanmar Script */
	Nkoo, /**< @brief N'Ko Script */
	Ogam, /**< @brief Ogham Script */
	Olck, /**< @brief Ol Chiki Script */
	Orkh, /**< @brief Old Turkic Script */
	Orya, /**< @brief Oriya Script */
	Osma, /**< @brief Osmanya Script */
	Phag, /**< @brief Phags-Pa Script */
	Phli, /**< @brief Inscriptional Pahlavi Script */
	Phnx, /**< @brief Phoenician Script */
	Plrd, /**< @brief Miao Script */
	Prti, /**< @brief Inscriptional Parthian Script */
	Rjng, /**< @brief Rejang Script */
	Runr, /**< @brief Runic Script */
	Samr, /**< @brief Samaritan Script */
	Sarb, /**< @brief Old South Arabian Script */
	Saur, /**< @brief Saurashtra Script */
	Shaw, /**< @brief Shavian Script */
	Shrd, /**< @brief Sharada Script */
	Sinh, /**< @brief Sinhala Script */
	Sora, /**< @brief Sora Sompeng Script */
	Sund, /**< @brief Sundanese Script */
	Sylo, /**< @brief Syloti Nagri Script */
	Syrn, /**< @brief Syriac (Eastern variant) Script */
	Tagb, /**< @brief Tagbanwa Script */
	Takr, /**< @brief Takri Script */
	Tale, /**< @brief Tai Le Script */
	Talu, /**< @brief New Tai Lue Script */
	Taml, /**< @brief Tamil Script */
	Tavt, /**< @brief Tai Viet Script */
	Telu, /**< @brief Telugu Script */
	Tfng, /**< @brief Tifinagh Script */
	Tglg, /**< @brief Tagalog Script */
	Thaa, /**< @brief Thaana Script */
	Thai, /**< @brief Thai Script */
	Tibt, /**< @brief Tibetan Script */
	Ugar, /**< @brief Ugaritic Script */
	Vaii, /**< @brief Vai Script */
	Xpeo, /**< @brief Old Persian Script */
	Xsux, /**< @brief Cuneiform Script */
	Yiii, /**< @brief Yi Script */
	Zyyy, /**< @brief Common or Inherited Script (both UCD values map to this code) */
	Zzzz, /**< @brief Unknown Script */
};
/** @brief Lookup the Script for a Unicode codepoint.
  *
  * @param c The Unicode codepoint to lookup.
  * @return The Script of the Unicode codepoint. Codepoints that have no entry
  *         in the script data, and values beyond the last Unicode codepoint,
  *         are reported as @c Zzzz.
  */
script lookup_script(codepoint_t c);
| //@} | |||
| /** @name ctype-style APIs | |||
| * @brief These functions provide wctype compatible functions using the UCD data. | |||
| @@ -78,18 +78,130 @@ const char *get_category_string(ucd::category c) | |||
| } | |||
| } | |||
| const char *get_script_string(ucd::script s) | |||
| { | |||
| using namespace ucd; | |||
| switch (s) | |||
| { | |||
| case Arab: return "Arab"; | |||
| case Armi: return "Armi"; | |||
| case Armn: return "Armn"; | |||
| case Avst: return "Avst"; | |||
| case Bali: return "Bali"; | |||
| case Bamu: return "Bamu"; | |||
| case Batk: return "Batk"; | |||
| case Beng: return "Beng"; | |||
| case Bopo: return "Bopo"; | |||
| case Brah: return "Brah"; | |||
| case Brai: return "Brai"; | |||
| case Bugi: return "Bugi"; | |||
| case Buhd: return "Buhd"; | |||
| case Cans: return "Cans"; | |||
| case Cari: return "Cari"; | |||
| case Cakm: return "Cakm"; | |||
| case Cham: return "Cham"; | |||
| case Cher: return "Cher"; | |||
| case Copt: return "Copt"; | |||
| case Cprt: return "Cprt"; | |||
| case Cyrl: return "Cyrl"; | |||
| case Deva: return "Deva"; | |||
| case Dsrt: return "Dsrt"; | |||
| case Egyp: return "Egyp"; | |||
| case Ethi: return "Ethi"; | |||
| case Geor: return "Geor"; | |||
| case Glag: return "Glag"; | |||
| case Goth: return "Goth"; | |||
| case Grek: return "Grek"; | |||
| case Gujr: return "Gujr"; | |||
| case Guru: return "Guru"; | |||
| case Hang: return "Hang"; | |||
| case Hano: return "Hano"; | |||
| case Hant: return "Hant"; | |||
| case Hebr: return "Hebr"; | |||
| case Hira: return "Hira"; | |||
| case Ital: return "Ital"; | |||
| case Java: return "Java"; | |||
| case Kali: return "Kali"; | |||
| case Kana: return "Kana"; | |||
| case Khar: return "Khar"; | |||
| case Khmr: return "Khmr"; | |||
| case Knda: return "Knda"; | |||
| case Kthi: return "Kthi"; | |||
| case Lana: return "Lana"; | |||
| case Laoo: return "Laoo"; | |||
| case Latn: return "Latn"; | |||
| case Lepc: return "Lepc"; | |||
| case Limb: return "Limb"; | |||
| case Linb: return "Linb"; | |||
| case Lisu: return "Lisu"; | |||
| case Lyci: return "Lyci"; | |||
| case Lydi: return "Lydi"; | |||
| case Mand: return "Mand"; | |||
| case Merc: return "Merc"; | |||
| case Mero: return "Mero"; | |||
| case Mlym: return "Mlym"; | |||
| case Mong: return "Mong"; | |||
| case Mtei: return "Mtei"; | |||
| case Mymr: return "Mymr"; | |||
| case Nkoo: return "Nkoo"; | |||
| case Ogam: return "Ogam"; | |||
| case Olck: return "Olck"; | |||
| case Orkh: return "Orkh"; | |||
| case Orya: return "Orya"; | |||
| case Osma: return "Osma"; | |||
| case Phag: return "Phag"; | |||
| case Phli: return "Phli"; | |||
| case Phnx: return "Phnx"; | |||
| case Plrd: return "Plrd"; | |||
| case Prti: return "Prti"; | |||
| case Rjng: return "Rjng"; | |||
| case Runr: return "Runr"; | |||
| case Samr: return "Samr"; | |||
| case Sarb: return "Sarb"; | |||
| case Saur: return "Saur"; | |||
| case Shaw: return "Shaw"; | |||
| case Shrd: return "Shrd"; | |||
| case Sinh: return "Sinh"; | |||
| case Sora: return "Sora"; | |||
| case Sund: return "Sund"; | |||
| case Sylo: return "Sylo"; | |||
| case Syrn: return "Syrn"; | |||
| case Tagb: return "Tagb"; | |||
| case Takr: return "Takr"; | |||
| case Tale: return "Tale"; | |||
| case Talu: return "Talu"; | |||
| case Taml: return "Taml"; | |||
| case Tavt: return "Tavt"; | |||
| case Telu: return "Telu"; | |||
| case Tfng: return "Tfng"; | |||
| case Tglg: return "Tglg"; | |||
| case Thaa: return "Thaa"; | |||
| case Thai: return "Thai"; | |||
| case Tibt: return "Tibt"; | |||
| case Ugar: return "Ugar"; | |||
| case Vaii: return "Vaii"; | |||
| case Xpeo: return "Xpeo"; | |||
| case Xsux: return "Xsux"; | |||
| case Yiii: return "Yiii"; | |||
| case Zyyy: return "Zyyy"; | |||
| case Zzzz: return "Zzzz"; | |||
| default: return "----"; | |||
| } | |||
| } | |||
| int main() | |||
| { | |||
| for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
| { | |||
| const char *script = get_script_string(ucd::lookup_script(c)); | |||
| const char *category = get_category_string(ucd::lookup_category(c)); | |||
| const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | |||
| ucd::codepoint_t upper = ucd::toupper(c); | |||
| ucd::codepoint_t lower = ucd::tolower(c); | |||
| ucd::codepoint_t title = ucd::totitle(c); | |||
| const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | |||
| printf("%06X %s %s %06X %06X %06X %s\n", | |||
| c, category_group, category, | |||
| printf("%06X %s %s %s %06X %06X %06X %s\n", | |||
| c, script, category_group, category, | |||
| upper, lower, title, | |||
| whitespace); | |||
| } | |||
| @@ -0,0 +1,102 @@ | |||
| #!/usr/bin/python | |||
| # Copyright (C) 2012 Reece H. Dunn | |||
| # | |||
| # This file is part of ucd-tools. | |||
| # | |||
| # ucd-tools is free software: you can redistribute it and/or modify | |||
| # it under the terms of the GNU General Public License as published by | |||
| # the Free Software Foundation, either version 3 of the License, or | |||
| # (at your option) any later version. | |||
| # | |||
| # ucd-tools is distributed in the hope that it will be useful, | |||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| # GNU General Public License for more details. | |||
| # | |||
| # You should have received a copy of the GNU General Public License | |||
| # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
| import os | |||
def read_data(path, split_char=':'):
    """Yield the fields of each non-comment line of a delimited text file.

    Every line has its newline removed and is split on *split_char*.  Lines
    that start with ``#`` are skipped.  Blank lines are not filtered out and
    are yielded as ``['']``.
    """
    with open(path) as source:
        for raw in source:
            text = raw.replace('\n', '')
            if text.startswith('#'):
                continue
            yield text.split(split_char)
def fold_lines(path):
    """Yield the logical lines of a file, joining folded continuation lines.

    A physical line that starts with a space continues the previous line:
    its first character is dropped and the rest is appended to the line being
    accumulated.  Empty logical lines are not yielded.

    The final logical line of the file is yielded as well; it has no
    successor to trigger it inside the loop, so it is flushed after the loop.
    """
    pending = None
    with open(path) as f:
        for line in f:
            line = line.replace('\n', '')
            if line.startswith(' '):
                # ``pending`` is still None when the very first line of the
                # file is a continuation; treat that as an empty prefix
                # rather than formatting the text "None" into the output.
                pending = '%s%s' % (pending or '', line[1:])
                continue
            if pending:
                yield pending
            pending = line
    # A logical line is only emitted once the line after it has been read,
    # so the last one is still waiting here when the file ends.
    if pending:
        yield pending
def iana_subtag_entries(path):
    """Yield one dict of ``field name -> value`` per record in a registry file.

    Records are separated by ``%%`` lines and each logical line has the form
    ``Key: value``.  Only records that contain a ``Type`` field are yielded,
    which skips header-style records that carry no subtag.  This applies to
    the last record of the file too, so an empty file or one that ends with a
    separator does not produce an empty dict.
    """
    tag = {}
    for line in fold_lines(path):
        if line == '%%':
            if 'Type' in tag:
                yield tag
            tag = {}
            continue
        # Split on the first ': ' only, so values may themselves contain it.
        key, _, value = line.partition(': ')
        if key == 'Description' and key in tag:
            # Only select the first Description. This handles subtag codes
            # that have multiple descriptions (e.g. 'es' maps to "Spanish"
            # and "Castilian").
            continue
        tag[key] = value
    # The last record has no trailing '%%'; apply the same 'Type' filter.
    if 'Type' in tag:
        yield tag
# Registry "Type" field value -> label stored on the parsed entry.
typemap = dict([
    ('extlang', 'ExtLang'),
    ('grandfathered', 'Grandfathered'),
    ('language', 'Language'),
    ('redundant', 'Redundant'),
    ('region', 'Region'),
    ('script', 'Script'),
    ('variant', 'Variant'),
])

# Registry "Scope" field value -> label that replaces the entry's type when
# a scope is present.
scopemap = dict([
    ('collection', 'Collection'),
    ('macrolanguage', 'MacroLanguage'),
    ('special', 'Special'),
    ('private-use', 'PrivateUse'),
])
def read_iana_subtags(path):
    """Return a dict mapping each subtag (or tag) to its registry entry.

    The ``Subtag`` field, or ``Tag`` when there is no ``Subtag``, is removed
    from the entry and used as the key.  The entry's ``Type`` is replaced by
    the ``scopemap`` label when a ``Scope`` field is present (which is then
    removed) and by the ``typemap`` label otherwise.  Entries whose key
    contains ``..`` are left out.

    Raises Exception when a ``Scope`` field appears on an entry whose type is
    not ``language``.
    """
    tags = {}
    for tag in iana_subtag_entries(path):
        ref_field = 'Subtag' if 'Subtag' in tag else 'Tag'
        ref = tag[ref_field]
        del tag[ref_field]

        if 'Scope' not in tag:
            tag['Type'] = typemap[tag['Type']]
        else:
            if tag['Type'] != 'language':
                raise Exception('"Scope" property unexpected for Type="%s"' % tag['Type'])
            tag['Type'] = scopemap[tag['Scope']]
            del tag['Scope']

        if '..' in ref:
            continue  # exclude private use definitions
        tags[ref] = tag
    return tags
| @@ -32,21 +32,29 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'): | |||
| if data['Property'] in ['White_Space']: | |||
| for codepoint in data['Range']: | |||
| unicode_chars[codepoint]['Properties'].append(data['Property']) | |||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
| for codepoint in data['Range']: | |||
| unicode_chars[codepoint]['Script'] = data['Script'] | |||
| null = ucd.CodePoint('0000') | |||
| if __name__ == '__main__': | |||
| for codepoint in ucd.CodeRange('000000..10FFFF'): | |||
| try: | |||
| data = unicode_chars[codepoint] | |||
| title = data['TitleCase'] | |||
| upper = data['UpperCase'] | |||
| lower = data['LowerCase'] | |||
| if title == null: title = codepoint | |||
| if upper == null: upper = codepoint | |||
| if lower == null: lower = codepoint | |||
| print '%s %s %s %s %s %s %s' % ( | |||
| codepoint, data['GeneralCategory'][0], data['GeneralCategory'], | |||
| upper, lower, title, | |||
| ' '.join(data['Properties'])) | |||
| except KeyError: | |||
| print '%s C Cn %s %s %s ' % (codepoint, codepoint, codepoint, codepoint) | |||
| data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []} | |||
| try: | |||
| script = data['Script'] | |||
| except KeyError: | |||
| script = 'Zzzz' | |||
| title = data['TitleCase'] | |||
| upper = data['UpperCase'] | |||
| lower = data['LowerCase'] | |||
| if title == null: title = codepoint | |||
| if upper == null: upper = codepoint | |||
| if lower == null: lower = codepoint | |||
| print '%s %s %s %s %s %s %s %s' % ( | |||
| codepoint, script, | |||
| data['GeneralCategory'][0], data['GeneralCategory'], | |||
| upper, lower, title, | |||
| ' '.join(data['Properties'])) | |||
| @@ -0,0 +1,172 @@ | |||
| #!/usr/bin/python | |||
| # Copyright (C) 2012 Reece H. Dunn | |||
| # | |||
| # This file is part of ucd-tools. | |||
| # | |||
| # ucd-tools is free software: you can redistribute it and/or modify | |||
| # it under the terms of the GNU General Public License as published by | |||
| # the Free Software Foundation, either version 3 of the License, or | |||
| # (at your option) any later version. | |||
| # | |||
| # ucd-tools is distributed in the hope that it will be useful, | |||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| # GNU General Public License for more details. | |||
| # | |||
| # You should have received a copy of the GNU General Public License | |||
| # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
| import os | |||
| import sys | |||
| import ucd | |||
# The directory holding the UCD data files and the UCD version string are
# taken from the first two command-line arguments.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# codepoint -> 'Script' field, for every codepoint covered by a range in the
# Scripts data. Codepoints missing from this map are treated as 'Zzzz' when
# the tables are built below.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (codepoint range, script, description). A script of None
# marks a mixed range that needs per-codepoint tables; otherwise the whole
# range is reported as that script by a single range test.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
    (ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
    (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
    (ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
]

# These scripts have many pages consisting of just this script:
# (populated below: a script is added the first time a complete page of 256
# codepoints maps to it, and every such page then shares one table)
special_scripts = []

# '<first>_<last>' of each mixed range -> {first codepoint of page: page},
# where a page is either a script name (uniform page that uses a shared
# table) or a list with one script name per codepoint in the page.
script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None      # scripts collected for the current page
        table_codepoint = None  # first codepoint of the current page
        table_script = None     # the page's script while uniform, else None
        for i, codepoint in enumerate(codepoints):
            try:
                script = unicode_chars[codepoint]
            except KeyError:
                script = 'Zzzz' # Unknown
            if (i % 256) == 0:
                # Page boundary: store the page just completed (if any)
                # before starting a new one.
                if table_entry:
                    if table_script in special_scripts:
                        table[table_codepoint] = table_script
                    elif table_script:
                        special_scripts.append(table_script)
                        table[table_codepoint] = table_script
                    else:
                        table[table_codepoint] = table_entry
                table_entry = []
                table_codepoint = codepoint
                table_script = script
            if script != table_script:
                # The page is mixed; None never compares equal to a script
                # name, so it stays None until the next page starts.
                table_script = None
            table_entry.append(script)
        # Store the last page of the range. Unlike the in-loop case, a
        # uniform page whose script is not already in special_scripts is
        # kept as a plain per-codepoint list here.
        if table_entry:
            if table_script in special_scripts:
                table[table_codepoint] = table_script
            else:
                table[table_codepoint] = table_entry
        script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
| if __name__ == '__main__': | |||
| sys.stdout.write("""/* Unicode Scripts | |||
| * | |||
| * Copyright (C) 2012 Reece H. Dunn | |||
| * | |||
| * This file is part of ucd-tools. | |||
| * | |||
| * ucd-tools is free software: you can redistribute it and/or modify | |||
| * it under the terms of the GNU General Public License as published by | |||
| * the Free Software Foundation, either version 3 of the License, or | |||
| * (at your option) any later version. | |||
| * | |||
| * ucd-tools is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| * GNU General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU General Public License | |||
| * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
| */ | |||
| // NOTE: This file is automatically generated from the Scripts.txt file in | |||
| // the Unicode Character database by the ucd-tools/tools/scripts.py script. | |||
| #include "ucd/ucd.h" | |||
| #include <stddef.h> | |||
| using namespace ucd; | |||
| // Unicode Character Data %s | |||
| """ % ucd_version) | |||
    # Shared page tables: one 256-entry array per script in special_scripts,
    # with every entry set to that script.
    for script in special_scripts:
        sys.stdout.write('\n')
        sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
        sys.stdout.write('{')
        for i in range(0, 256):
            if (i % 16) == 0:
                # Start a new row of 16 entries, labelled with its offset.
                sys.stdout.write('\n\t/* %02X */' % i)
            sys.stdout.write(' %s,' % script)
        sys.stdout.write('\n};\n')

    # Per-page tables for the mixed ranges, named after the first codepoint
    # of the page. Pages stored as a script name reuse a shared table above
    # and are skipped here.
    for codepoints, script, comment in script_sets:
        if not script:
            tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
            for codepoint in sorted(tables.keys()):
                table = tables[codepoint]
                if table in special_scripts:
                    continue
                sys.stdout.write('\n')
                sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
                sys.stdout.write('{')
                for i, script in enumerate(table):
                    if (i % 16) == 0:
                        sys.stdout.write('\n\t/* %02X */' % i)
                    sys.stdout.write(' %s,' % script)
                sys.stdout.write('\n};\n')

    # Index tables: for each mixed range, an array with one pointer per page,
    # pointing at either the shared script table or the page's own table.
    for codepoints, script, comment in script_sets:
        if not script:
            table_index = '%s_%s' % (codepoints.first, codepoints.last)
            sys.stdout.write('\n')
            sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
            sys.stdout.write('{\n')
            for codepoint, table in sorted(script_tables[table_index].items()):
                if isinstance(table, str):
                    sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
                else:
                    sys.stdout.write('\tscripts_%s,\n' % codepoint)
            sys.stdout.write('};\n')

    # The lookup function: a chain of `c <= last` tests emitted in the order
    # of script_sets (ascending ranges), so the first test that matches
    # identifies the range containing c.
    sys.stdout.write('\n')
    sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
    sys.stdout.write('{\n')
    for codepoints, script, comment in script_sets:
        if script:
            sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
        else:
            sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
            sys.stdout.write('\t{\n')
            # Select the page by offset from the start of the range, then
            # the entry by the low 8 bits of the codepoint.
            sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
            sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
            sys.stdout.write('\t}\n')
    sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
    sys.stdout.write('}\n')
| @@ -19,6 +19,30 @@ | |||
| import os | |||
| import sys | |||
| import iana | |||
# UCD script name (words joined by underscores) -> four-letter script code.
# Seeded with the names that the registry-derived entries below cannot supply.
script_map = dict(
    # UCD script names not derivable from IANA script tags:
    Canadian_Aboriginal='Cans',
    Common='Zyyy',
    Egyptian_Hieroglyphs='Egyp',
    Inherited='Zyyy',
    Meetei_Mayek='Mtei',
    Nko='Nkoo',
    Phags_Pa='Phag',
    # Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
    Cuneiform='Xsux',
)

# NOTE(review): two registry scripts whose descriptions differ only in the
# parenthesised part reduce to the same name here, and whichever is visited
# last wins -- confirm the resulting codes are the intended ones.
for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
    if tag['Type'] != 'Script':
        continue
    # Convert the IANA script tag description to the UCD script name: drop
    # any parenthesised qualifier, then join the words with underscores.
    desc = tag['Description'].split(' (')[0].replace(' ', '_')
    script_map[desc] = ref

# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'
| class CodePoint: | |||
| def __init__(self, x): | |||
| @@ -86,6 +110,9 @@ def boolean(x): | |||
| return True | |||
| return False | |||
def script(x):
    """Convert a UCD script name to its script code using ``script_map``.

    Raises KeyError, naming the offending value, when the name has no entry
    in ``script_map`` (for example a script that is in the UCD data but not
    yet mapped here).
    """
    try:
        return script_map[x]
    except KeyError:
        # Keep the exception type, but say what is missing and where to add it.
        raise KeyError('unknown UCD script name "%s": add it to script_map' % x)
| data_items = { | |||
| 'Blocks': [ | |||
| ('Range', codepoint), | |||
| @@ -101,7 +128,7 @@ data_items = { | |||
| ], | |||
| 'Scripts': [ | |||
| ('Range', codepoint), | |||
| ('Script', str), | |||
| ('Script', script), | |||
| ], | |||
| 'UnicodeData': [ | |||
| ('CodePoint', codepoint), | |||