UCD_VERSION=6.2.0 | UCD_VERSION=6.2.0 | ||||
UCD_ROOTDIR=data/ucd | UCD_ROOTDIR=data/ucd | ||||
data/language-subtag-registry: | |||||
mkdir -pv data | |||||
wget -O $@ http://www.iana.org/assignments/language-subtag-registry | |||||
data/ucd/PropList.txt: | data/ucd/PropList.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt | wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt | ||||
data/ucd/Scripts.txt: | |||||
mkdir -pv data/ucd | |||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt | |||||
data/ucd/UnicodeData.txt: | data/ucd/UnicodeData.txt: | ||||
mkdir -pv data/ucd | mkdir -pv data/ucd | ||||
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt | wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt | ||||
############################# libucd ########################################## | ############################# libucd ########################################## | ||||
src/case.cpp: tools/case.py tools/ucd.py data/ucd/UnicodeData.txt | |||||
src/case.cpp: tools/case.py tools/ucd.py \ | |||||
data/ucd/UnicodeData.txt | |||||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
src/categories.cpp: tools/categories.py tools/ucd.py data/ucd/UnicodeData.txt | |||||
src/categories.cpp: tools/categories.py tools/ucd.py \ | |||||
data/ucd/UnicodeData.txt | |||||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | ||||
src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||||
data/language-subtag-registry \ | |||||
data/ucd/Scripts.txt | |||||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||||
libucd_includedir = $(includedir)/ucd | libucd_includedir = $(includedir)/ucd | ||||
libucd_include_HEADERS = \ | libucd_include_HEADERS = \ | ||||
src/include/ucd/ucd.h | src/include/ucd/ucd.h | ||||
src_libucd_la_SOURCES = \ | src_libucd_la_SOURCES = \ | ||||
src/case.cpp \ | src/case.cpp \ | ||||
src/categories.cpp \ | src/categories.cpp \ | ||||
src/ctype.cpp | |||||
src/ctype.cpp \ | |||||
src/scripts.cpp | |||||
############################# tests ########################################### | ############################# tests ########################################### | ||||
category lookup_category(codepoint_t c); | category lookup_category(codepoint_t c); | ||||
//@} | |||||
/** @name Unicode Script | |||||
* @brief These functions query the Script property of Unicode codepoints. | |||||
*/ | |||||
//@{ | |||||
/** @brief Unicode Script | |||||
* @see http://www.iana.org/assignments/language-subtag-registry | |||||
* @see http://www.unicode.org/iso15924/iso15924-codes.html | |||||
*/ | |||||
enum script | |||||
{ | |||||
// NOTE: the enumerators have no explicit values. The generated lookup code
// stores them in uint8_t tables and casts the stored byte back to
// ucd::script, so this enum must stay within 256 enumerators.
Arab, /**< @brief Arabic Script */ | |||||
Armi, /**< @brief Imperial Aramaic Script */ | |||||
Armn, /**< @brief Armenian Script */ | |||||
Avst, /**< @brief Avestan Script */ | |||||
Bali, /**< @brief Balinese Script */ | |||||
Bamu, /**< @brief Bamum Script */ | |||||
Batk, /**< @brief Batak Script */ | |||||
Beng, /**< @brief Bengali Script */ | |||||
Bopo, /**< @brief Bopomofo Script */ | |||||
Brah, /**< @brief Brahmi Script */ | |||||
Brai, /**< @brief Braille Script */ | |||||
Bugi, /**< @brief Buginese Script */ | |||||
Buhd, /**< @brief Buhid Script */ | |||||
Cans, /**< @brief Unified Canadian Aboriginal Syllabics */ | |||||
Cari, /**< @brief Carian Script */ | |||||
Cakm, /**< @brief Chakma Script */ | |||||
Cham, /**< @brief Cham Script */ | |||||
Cher, /**< @brief Cherokee Script */ | |||||
Copt, /**< @brief Coptic Script */ | |||||
Cprt, /**< @brief Cypriot Script */ | |||||
Cyrl, /**< @brief Cyrillic Script */ | |||||
Deva, /**< @brief Devanagari Script */ | |||||
Dsrt, /**< @brief Deseret Script */ | |||||
Egyp, /**< @brief Egyptian Hieroglyphs */ | |||||
Ethi, /**< @brief Ethiopic Script */ | |||||
Geor, /**< @brief Georgian Script */ | |||||
Glag, /**< @brief Glagolitic Script */ | |||||
Goth, /**< @brief Gothic Script */ | |||||
Grek, /**< @brief Greek Script */ | |||||
Gujr, /**< @brief Gujarati Script */ | |||||
Guru, /**< @brief Gurmukhi Script */ | |||||
Hang, /**< @brief Hangul Script */ | |||||
Hano, /**< @brief Hanunoo Script */ | |||||
Hant, /**< @brief Han (Traditional) Script */ | |||||
Hebr, /**< @brief Hebrew Script */ | |||||
Hira, /**< @brief Hiragana Script */ | |||||
Ital, /**< @brief Old Italic Script */ | |||||
Java, /**< @brief Javanese Script */ | |||||
Kali, /**< @brief Kayah Li Script */ | |||||
Kana, /**< @brief Katakana Script */ | |||||
Khar, /**< @brief Kharoshthi Script */ | |||||
Khmr, /**< @brief Khmer Script */ | |||||
Knda, /**< @brief Kannada Script */ | |||||
Kthi, /**< @brief Kaithi Script */ | |||||
Lana, /**< @brief Tai Tham Script */ | |||||
Laoo, /**< @brief Lao Script */ | |||||
Latn, /**< @brief Latin Script */ | |||||
Lepc, /**< @brief Lepcha Script */ | |||||
Limb, /**< @brief Limbu Script */ | |||||
Linb, /**< @brief Linear B Script */ | |||||
Lisu, /**< @brief Lisu Script */ | |||||
Lyci, /**< @brief Lycian Script */ | |||||
Lydi, /**< @brief Lydian Script */ | |||||
Mand, /**< @brief Mandaic Script */ | |||||
Merc, /**< @brief Meroitic Cursive Script */ | |||||
Mero, /**< @brief Meroitic Hieroglyphs */ | |||||
Mlym, /**< @brief Malayalam Script */ | |||||
Mong, /**< @brief Mongolian Script */ | |||||
Mtei, /**< @brief Meitei Mayek Script */ | |||||
Mymr, /**< @brief Myanmar Script */ | |||||
Nkoo, /**< @brief N'Ko Script */ | |||||
Ogam, /**< @brief Ogham Script */ | |||||
Olck, /**< @brief Ol Chiki Script */ | |||||
Orkh, /**< @brief Old Turkic Script */ | |||||
Orya, /**< @brief Oriya Script */ | |||||
Osma, /**< @brief Osmanya Script */ | |||||
Phag, /**< @brief Phags-Pa Script */ | |||||
Phli, /**< @brief Inscriptional Pahlavi Script */ | |||||
Phnx, /**< @brief Phoenician Script */ | |||||
Plrd, /**< @brief Miao Script */ | |||||
Prti, /**< @brief Inscriptional Parthian Script */ | |||||
Rjng, /**< @brief Rejang Script */ | |||||
Runr, /**< @brief Runic Script */ | |||||
Samr, /**< @brief Samaritan Script */ | |||||
Sarb, /**< @brief Old South Arabian Script */ | |||||
Saur, /**< @brief Saurashtra Script */ | |||||
Shaw, /**< @brief Shavian Script */ | |||||
Shrd, /**< @brief Sharada Script */ | |||||
Sinh, /**< @brief Sinhala Script */ | |||||
Sora, /**< @brief Sora Sompeng Script */ | |||||
Sund, /**< @brief Sundanese Script */ | |||||
Sylo, /**< @brief Syloti Nagri Script */ | |||||
Syrn, /**< @brief Syriac (Eastern) Script */ | |||||
Tagb, /**< @brief Tagbanwa Script */ | |||||
Takr, /**< @brief Takri Script */ | |||||
Tale, /**< @brief Tai Le Script */ | |||||
Talu, /**< @brief New Tai Lue Script */ | |||||
Taml, /**< @brief Tamil Script */ | |||||
Tavt, /**< @brief Tai Viet Script */ | |||||
Telu, /**< @brief Telugu Script */ | |||||
Tfng, /**< @brief Tifinagh Script */ | |||||
Tglg, /**< @brief Tagalog Script */ | |||||
Thaa, /**< @brief Thaana Script */ | |||||
Thai, /**< @brief Thai Script */ | |||||
Tibt, /**< @brief Tibetan Script */ | |||||
Ugar, /**< @brief Ugaritic Script */ | |||||
Vaii, /**< @brief Vai Script */ | |||||
Xpeo, /**< @brief Old Persian Script */ | |||||
Xsux, /**< @brief Cuneiform Script */ | |||||
Yiii, /**< @brief Yi Script */ | |||||
Zyyy, /**< @brief Common or Inherited Script */ | |||||
Zzzz, /**< @brief Unknown Script */ | |||||
}; | |||||
/** @brief Lookup the Script for a Unicode codepoint. | |||||
* | |||||
* @param c The Unicode codepoint to lookup. | |||||
* @return The Script of the Unicode codepoint. | |||||
*/ | |||||
script lookup_script(codepoint_t c); | |||||
//@} | //@} | ||||
/** @name ctype-style APIs | /** @name ctype-style APIs | ||||
* @brief These functions provide wctype compatible functions using the UCD data. | * @brief These functions provide wctype compatible functions using the UCD data. |
} | } | ||||
} | } | ||||
/** @brief Get the four-letter code of a script value as a C string.
  *
  * @param s The script value to name.
  * @return  The enumerator's own spelling (e.g. "Latn"), or "----" when the
  *          value does not match any enumerator handled below.
  */
const char *get_script_string(ucd::script s)
{
	// Each case returns the enumerator name itself, so let the preprocessor
	// stringify it instead of spelling every name twice.
#define UCD_SCRIPT_CASE(code) case ucd::code: return #code;
	switch (s)
	{
	UCD_SCRIPT_CASE(Arab) UCD_SCRIPT_CASE(Armi) UCD_SCRIPT_CASE(Armn) UCD_SCRIPT_CASE(Avst)
	UCD_SCRIPT_CASE(Bali) UCD_SCRIPT_CASE(Bamu) UCD_SCRIPT_CASE(Batk) UCD_SCRIPT_CASE(Beng)
	UCD_SCRIPT_CASE(Bopo) UCD_SCRIPT_CASE(Brah) UCD_SCRIPT_CASE(Brai) UCD_SCRIPT_CASE(Bugi)
	UCD_SCRIPT_CASE(Buhd)
	UCD_SCRIPT_CASE(Cans) UCD_SCRIPT_CASE(Cari) UCD_SCRIPT_CASE(Cakm) UCD_SCRIPT_CASE(Cham)
	UCD_SCRIPT_CASE(Cher) UCD_SCRIPT_CASE(Copt) UCD_SCRIPT_CASE(Cprt) UCD_SCRIPT_CASE(Cyrl)
	UCD_SCRIPT_CASE(Deva) UCD_SCRIPT_CASE(Dsrt)
	UCD_SCRIPT_CASE(Egyp) UCD_SCRIPT_CASE(Ethi)
	UCD_SCRIPT_CASE(Geor) UCD_SCRIPT_CASE(Glag) UCD_SCRIPT_CASE(Goth) UCD_SCRIPT_CASE(Grek)
	UCD_SCRIPT_CASE(Gujr) UCD_SCRIPT_CASE(Guru)
	UCD_SCRIPT_CASE(Hang) UCD_SCRIPT_CASE(Hano) UCD_SCRIPT_CASE(Hant) UCD_SCRIPT_CASE(Hebr)
	UCD_SCRIPT_CASE(Hira)
	UCD_SCRIPT_CASE(Ital)
	UCD_SCRIPT_CASE(Java)
	UCD_SCRIPT_CASE(Kali) UCD_SCRIPT_CASE(Kana) UCD_SCRIPT_CASE(Khar) UCD_SCRIPT_CASE(Khmr)
	UCD_SCRIPT_CASE(Knda) UCD_SCRIPT_CASE(Kthi)
	UCD_SCRIPT_CASE(Lana) UCD_SCRIPT_CASE(Laoo) UCD_SCRIPT_CASE(Latn) UCD_SCRIPT_CASE(Lepc)
	UCD_SCRIPT_CASE(Limb) UCD_SCRIPT_CASE(Linb) UCD_SCRIPT_CASE(Lisu) UCD_SCRIPT_CASE(Lyci)
	UCD_SCRIPT_CASE(Lydi)
	UCD_SCRIPT_CASE(Mand) UCD_SCRIPT_CASE(Merc) UCD_SCRIPT_CASE(Mero) UCD_SCRIPT_CASE(Mlym)
	UCD_SCRIPT_CASE(Mong) UCD_SCRIPT_CASE(Mtei) UCD_SCRIPT_CASE(Mymr)
	UCD_SCRIPT_CASE(Nkoo)
	UCD_SCRIPT_CASE(Ogam) UCD_SCRIPT_CASE(Olck) UCD_SCRIPT_CASE(Orkh) UCD_SCRIPT_CASE(Orya)
	UCD_SCRIPT_CASE(Osma)
	UCD_SCRIPT_CASE(Phag) UCD_SCRIPT_CASE(Phli) UCD_SCRIPT_CASE(Phnx) UCD_SCRIPT_CASE(Plrd)
	UCD_SCRIPT_CASE(Prti)
	UCD_SCRIPT_CASE(Rjng) UCD_SCRIPT_CASE(Runr)
	UCD_SCRIPT_CASE(Samr) UCD_SCRIPT_CASE(Sarb) UCD_SCRIPT_CASE(Saur) UCD_SCRIPT_CASE(Shaw)
	UCD_SCRIPT_CASE(Shrd) UCD_SCRIPT_CASE(Sinh) UCD_SCRIPT_CASE(Sora) UCD_SCRIPT_CASE(Sund)
	UCD_SCRIPT_CASE(Sylo) UCD_SCRIPT_CASE(Syrn)
	UCD_SCRIPT_CASE(Tagb) UCD_SCRIPT_CASE(Takr) UCD_SCRIPT_CASE(Tale) UCD_SCRIPT_CASE(Talu)
	UCD_SCRIPT_CASE(Taml) UCD_SCRIPT_CASE(Tavt) UCD_SCRIPT_CASE(Telu) UCD_SCRIPT_CASE(Tfng)
	UCD_SCRIPT_CASE(Tglg) UCD_SCRIPT_CASE(Thaa) UCD_SCRIPT_CASE(Thai) UCD_SCRIPT_CASE(Tibt)
	UCD_SCRIPT_CASE(Ugar)
	UCD_SCRIPT_CASE(Vaii)
	UCD_SCRIPT_CASE(Xpeo) UCD_SCRIPT_CASE(Xsux)
	UCD_SCRIPT_CASE(Yiii)
	UCD_SCRIPT_CASE(Zyyy) UCD_SCRIPT_CASE(Zzzz)
	default: return "----";
	}
#undef UCD_SCRIPT_CASE
}
int main() | int main() | ||||
{ | { | ||||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | ||||
{ | { | ||||
const char *script = get_script_string(ucd::lookup_script(c)); | |||||
const char *category = get_category_string(ucd::lookup_category(c)); | const char *category = get_category_string(ucd::lookup_category(c)); | ||||
const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | ||||
ucd::codepoint_t upper = ucd::toupper(c); | ucd::codepoint_t upper = ucd::toupper(c); | ||||
ucd::codepoint_t lower = ucd::tolower(c); | ucd::codepoint_t lower = ucd::tolower(c); | ||||
ucd::codepoint_t title = ucd::totitle(c); | ucd::codepoint_t title = ucd::totitle(c); | ||||
const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | ||||
printf("%06X %s %s %06X %06X %06X %s\n", | |||||
c, category_group, category, | |||||
printf("%06X %s %s %s %06X %06X %06X %s\n", | |||||
c, script, category_group, category, | |||||
upper, lower, title, | upper, lower, title, | ||||
whitespace); | whitespace); | ||||
} | } |
#!/usr/bin/python | |||||
# Copyright (C) 2012 Reece H. Dunn | |||||
# | |||||
# This file is part of ucd-tools. | |||||
# | |||||
# ucd-tools is free software: you can redistribute it and/or modify | |||||
# it under the terms of the GNU General Public License as published by | |||||
# the Free Software Foundation, either version 3 of the License, or | |||||
# (at your option) any later version. | |||||
# | |||||
# ucd-tools is distributed in the hope that it will be useful, | |||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
# GNU General Public License for more details. | |||||
# | |||||
# You should have received a copy of the GNU General Public License | |||||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
import os | |||||
def read_data(path, split_char=':'):
    """Yield the fields of every non-comment line in the file at *path*.

    Each line has its newline removed and is split on *split_char*; lines
    that start with '#' are skipped. Blank lines are not filtered and come
    back as a single empty field.
    """
    with open(path) as source:
        records = (raw.replace('\n', '') for raw in source)
        for record in records:
            if record.startswith('#'):
                continue
            yield record.split(split_char)
def fold_lines(path):
    """Yield the logical lines of the file at *path*, unfolding continuations.

    A physical line that starts with a space continues the previous line:
    its text, minus that first space, is appended to the logical line being
    accumulated. Empty logical lines are not yielded.

    The last logical line is flushed once the file is exhausted, so the
    final field of the final record is not lost.
    """
    pending = None
    with open(path) as f:
        for line in f:
            line = line.replace('\n', '')
            if line.startswith(' '):
                # A continuation with nothing before it starts a new logical
                # line instead of being glued onto the text 'None'.
                if pending is None:
                    pending = line[1:]
                else:
                    pending = '%s%s' % (pending, line[1:])
                continue
            if pending:
                yield pending
            pending = line
    # Nothing follows the last line to trigger its yield inside the loop.
    if pending:
        yield pending
def iana_subtag_entries(path):
    """Yield one dict per record of the subtag registry file at *path*.

    Records are separated by a line holding only '%%'. Every other logical
    line (as produced by fold_lines) is split at the first ': ' into a key
    and a value. Records without a 'Type' field are skipped; this now also
    applies to the record left over at the end of the file, so an empty
    file or a trailing separator no longer produces a dict that lacks the
    'Type' key.
    """
    tag = {}
    for line in fold_lines(path):
        if line == '%%':
            if 'Type' in tag:
                yield tag
            tag = {}
            continue

        # Only the first ': ' separates key from value; the value itself may
        # contain further ': ' sequences.
        key, _, value = line.partition(': ')

        if key == 'Description':
            # Only select the first Description. This handles subtag codes
            # that have multiple descriptions (e.g. 'es' maps to "Spanish"
            # and "Castilian").
            tag.setdefault(key, value)
        else:
            tag[key] = value

    # The last record has no '%%' after it; apply the same filter as above.
    if 'Type' in tag:
        yield tag
# Maps the registry's lowercase 'Type' value to the name stored back into
# tag['Type'] by read_iana_subtags when the record has no 'Scope' field.
typemap = { | |||||
'extlang': 'ExtLang', | |||||
'grandfathered': 'Grandfathered', | |||||
'language': 'Language', | |||||
'redundant': 'Redundant', | |||||
'region': 'Region', | |||||
'script': 'Script', | |||||
'variant': 'Variant', | |||||
} | |||||
# Maps the registry's 'Scope' value to the name that replaces tag['Type'] in
# read_iana_subtags when a 'language' record carries a 'Scope' field.
scopemap = { | |||||
'collection': 'Collection', | |||||
'macrolanguage': 'MacroLanguage', | |||||
'special': 'Special', | |||||
'private-use': 'PrivateUse', | |||||
} | |||||
def read_iana_subtags(path):
    """Read the subtag registry at *path* into a dict keyed by subtag.

    The key is the record's 'Subtag' field, or its 'Tag' field when there is
    no 'Subtag'; that field is removed from the record. The record's 'Type'
    is rewritten through scopemap when a 'Scope' field is present (which is
    then removed) and through typemap otherwise. Keys containing '..' are
    left out of the result.

    Raises Exception when a record has a 'Scope' but its 'Type' is not
    'language', and KeyError for unknown type or scope values.
    """
    result = {}
    for entry in iana_subtag_entries(path):
        id_field = 'Subtag' if 'Subtag' in entry else 'Tag'
        ref = entry.pop(id_field)

        if 'Scope' not in entry:
            entry['Type'] = typemap[entry['Type']]
        else:
            kind = entry['Type']
            if kind != 'language':
                raise Exception('"Scope" property unexpected for Type="%s"' % kind)
            entry['Type'] = scopemap[entry['Scope']]
            del entry['Scope']

        # exclude private use definitions
        if '..' in ref:
            continue
        result[ref] = entry
    return result
if data['Property'] in ['White_Space']: | if data['Property'] in ['White_Space']: | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint]['Properties'].append(data['Property']) | unicode_chars[codepoint]['Properties'].append(data['Property']) | ||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||||
for codepoint in data['Range']: | |||||
unicode_chars[codepoint]['Script'] = data['Script'] | |||||
null = ucd.CodePoint('0000') | null = ucd.CodePoint('0000') | ||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
for codepoint in ucd.CodeRange('000000..10FFFF'): | for codepoint in ucd.CodeRange('000000..10FFFF'): | ||||
try: | try: | ||||
data = unicode_chars[codepoint] | data = unicode_chars[codepoint] | ||||
title = data['TitleCase'] | |||||
upper = data['UpperCase'] | |||||
lower = data['LowerCase'] | |||||
if title == null: title = codepoint | |||||
if upper == null: upper = codepoint | |||||
if lower == null: lower = codepoint | |||||
print '%s %s %s %s %s %s %s' % ( | |||||
codepoint, data['GeneralCategory'][0], data['GeneralCategory'], | |||||
upper, lower, title, | |||||
' '.join(data['Properties'])) | |||||
except KeyError: | except KeyError: | ||||
print '%s C Cn %s %s %s ' % (codepoint, codepoint, codepoint, codepoint) | |||||
data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []} | |||||
try: | |||||
script = data['Script'] | |||||
except KeyError: | |||||
script = 'Zzzz' | |||||
title = data['TitleCase'] | |||||
upper = data['UpperCase'] | |||||
lower = data['LowerCase'] | |||||
if title == null: title = codepoint | |||||
if upper == null: upper = codepoint | |||||
if lower == null: lower = codepoint | |||||
print '%s %s %s %s %s %s %s %s' % ( | |||||
codepoint, script, | |||||
data['GeneralCategory'][0], data['GeneralCategory'], | |||||
upper, lower, title, | |||||
' '.join(data['Properties'])) |
#!/usr/bin/python | |||||
# Copyright (C) 2012 Reece H. Dunn | |||||
# | |||||
# This file is part of ucd-tools. | |||||
# | |||||
# ucd-tools is free software: you can redistribute it and/or modify | |||||
# it under the terms of the GNU General Public License as published by | |||||
# the Free Software Foundation, either version 3 of the License, or | |||||
# (at your option) any later version. | |||||
# | |||||
# ucd-tools is distributed in the hope that it will be useful, | |||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
# GNU General Public License for more details. | |||||
# | |||||
# You should have received a copy of the GNU General Public License | |||||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
import os | |||||
import sys | |||||
import ucd | |||||
# Command-line arguments: the directory passed to ucd.parse_ucd_data, and the
# UCD version string, which is used below only in the header comment of the
# generated file.
ucd_rootdir = sys.argv[1] | |||||
ucd_version = sys.argv[2] | |||||
# Map each codepoint covered by the 'Scripts' data to its 'Script' value.
# Codepoints that never appear here are treated as 'Zzzz' further down.
unicode_chars = {} | |||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||||
for codepoint in data['Range']: | |||||
unicode_chars[codepoint] = data['Script'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | |||||
# data files. It is intended to reduce the number of character tables that | |||||
# need to be generated. | |||||
# Each entry is (codepoint range, script, description). A script of None means
# per-page lookup tables are built for the range; otherwise the generated
# lookup function returns that script for the whole range.
# The table-driven ranges start on a multiple of 256: pages are cut every 256
# codepoints counted from the range start, while the generated lookup indexes
# a page with 'c % 256'.
script_sets = [ | |||||
(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'), | |||||
(ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'), | |||||
(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'), | |||||
(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'), | |||||
(ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'), | |||||
(ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'), | |||||
(ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'), | |||||
(ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'), | |||||
(ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'), | |||||
] | |||||
# These scripts have many pages consisting of just this script: | |||||
# (filled in below the first time a page holding a single script is flushed)
special_scripts = [] | |||||
# Keyed by '<first>_<last>' of each table-driven range. Each value maps a
# page's first codepoint either to a script name (page shared with
# special_scripts) or to the list of scripts collected for that page.
script_tables = {} | |||||
for codepoints, script, comment in script_sets: | |||||
if not script: | |||||
table = {} | |||||
# State of the page being collected: its entries, its first codepoint, and
# the single script seen so far (None once two different scripts were seen).
table_entry = None | |||||
table_codepoint = None | |||||
table_script = None | |||||
for i, codepoint in enumerate(codepoints): | |||||
try: | |||||
script = unicode_chars[codepoint] | |||||
except KeyError: | |||||
script = 'Zzzz' # Unknown | |||||
# Every 256th codepoint starts a new page: flush the previous page, then
# reset the page state.
if (i % 256) == 0: | |||||
if table_entry: | |||||
if table_script in special_scripts: | |||||
table[table_codepoint] = table_script | |||||
elif table_script: | |||||
special_scripts.append(table_script) | |||||
table[table_codepoint] = table_script | |||||
else: | |||||
table[table_codepoint] = table_entry | |||||
table_entry = [] | |||||
table_codepoint = codepoint | |||||
table_script = script | |||||
# A second, different script on the page means it is no longer uniform.
if script != table_script: | |||||
table_script = None | |||||
table_entry.append(script) | |||||
# Flush the page still being collected (presumably once the codepoint loop
# has finished).
# NOTE(review): unlike the flush above, a uniform page whose script is not
# already in special_scripts is stored as a full list here - confirm this
# is intended.
if table_entry: | |||||
if table_script in special_scripts: | |||||
table[table_codepoint] = table_script | |||||
else: | |||||
table[table_codepoint] = table_entry | |||||
script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table | |||||
# Script entry point: write the generated C++ source for ucd::lookup_script
# to standard output.
if __name__ == '__main__': | |||||
sys.stdout.write("""/* Unicode Scripts | |||||
* | |||||
* Copyright (C) 2012 Reece H. Dunn | |||||
* | |||||
* This file is part of ucd-tools. | |||||
* | |||||
* ucd-tools is free software: you can redistribute it and/or modify | |||||
* it under the terms of the GNU General Public License as published by | |||||
* the Free Software Foundation, either version 3 of the License, or | |||||
* (at your option) any later version. | |||||
* | |||||
* ucd-tools is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
* GNU General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU General Public License | |||||
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
*/ | |||||
// NOTE: This file is automatically generated from the Scripts.txt file in | |||||
// the Unicode Character database by the ucd-tools/tools/scripts.py script. | |||||
#include "ucd/ucd.h" | |||||
#include <stddef.h> | |||||
using namespace ucd; | |||||
// Unicode Character Data %s | |||||
""" % ucd_version) | |||||
# 1. One shared 256-entry table per script in special_scripts, every entry
#    being that script.
for script in special_scripts: | |||||
sys.stdout.write('\n') | |||||
sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script) | |||||
sys.stdout.write('{') | |||||
for i in range(0, 256): | |||||
if (i % 16) == 0: | |||||
sys.stdout.write('\n\t/* %02X */' % i) | |||||
sys.stdout.write(' %s,' % script) | |||||
sys.stdout.write('\n};\n') | |||||
# 2. One table per remaining page, named after the page's first codepoint.
#    Pages stored as a script name are covered by the shared tables above
#    and are skipped.
for codepoints, script, comment in script_sets: | |||||
if not script: | |||||
tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)] | |||||
for codepoint in sorted(tables.keys()): | |||||
table = tables[codepoint] | |||||
if table in special_scripts: | |||||
continue | |||||
sys.stdout.write('\n') | |||||
sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint) | |||||
sys.stdout.write('{') | |||||
for i, script in enumerate(table): | |||||
if (i % 16) == 0: | |||||
sys.stdout.write('\n\t/* %02X */' % i) | |||||
sys.stdout.write(' %s,' % script) | |||||
sys.stdout.write('\n};\n') | |||||
# 3. One index array per table-driven range, listing that range's page
#    tables in codepoint order.
for codepoints, script, comment in script_sets: | |||||
if not script: | |||||
table_index = '%s_%s' % (codepoints.first, codepoints.last) | |||||
sys.stdout.write('\n') | |||||
sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index) | |||||
sys.stdout.write('{\n') | |||||
for codepoint, table in sorted(script_tables[table_index].items()): | |||||
# A str entry refers to a shared per-script table; the page's own first
# codepoint is kept as a trailing comment.
if isinstance(table, str): | |||||
sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint)) | |||||
else: | |||||
sys.stdout.write('\tscripts_%s,\n' % codepoint) | |||||
sys.stdout.write('};\n') | |||||
# 4. The lookup function: a chain of 'c <= last' tests in script_sets order,
#    returning either the fixed script or the entry from the range's tables.
sys.stdout.write('\n') | |||||
sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n') | |||||
sys.stdout.write('{\n') | |||||
for codepoints, script, comment in script_sets: | |||||
if script: | |||||
sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment)) | |||||
else: | |||||
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints)) | |||||
sys.stdout.write('\t{\n') | |||||
sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first)) | |||||
sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n') | |||||
sys.stdout.write('\t}\n') | |||||
sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n') | |||||
sys.stdout.write('}\n') | |||||
import os | import os | ||||
import sys | import sys | ||||
import iana | |||||
# Maps UCD script names (as they appear in the 'Script' field) to the
# four-letter script codes. Seeded with the names that cannot be derived from
# the IANA registry, then extended from the registry below.
# NOTE(review): 'Common' and 'Inherited' both map to 'Zyyy', so the two are
# indistinguishable after this mapping - confirm this is intended.
script_map = { | |||||
# UCD script names not derivable from IANA script tags: | |||||
'Canadian_Aboriginal': 'Cans', | |||||
'Common': 'Zyyy', | |||||
'Egyptian_Hieroglyphs': 'Egyp', | |||||
'Inherited': 'Zyyy', | |||||
'Meetei_Mayek': 'Mtei', | |||||
'Nko': 'Nkoo', | |||||
'Phags_Pa': 'Phag', | |||||
# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA: | |||||
'Cuneiform': 'Xsux', | |||||
} | |||||
# NOTE: the registry is read from a path relative to the current working
# directory when this module is loaded.
for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items(): | |||||
if tag['Type'] == 'Script': | |||||
# Convert the IANA script tag descriptions to the UCD script names: | |||||
# (drop any ' (...)' qualifier, then replace spaces with underscores)
desc = tag['Description'] | |||||
if ' (' in desc: | |||||
desc = desc.split(' (')[0] | |||||
desc = desc.replace(' ', '_') | |||||
script_map[desc] = ref | |||||
# Fix up incorrectly mapped script names: | |||||
# (presumably needed because several registry descriptions reduce to the same
# name once the qualifier is dropped - verify against the registry)
script_map['Cyrillic'] = 'Cyrl' | |||||
class CodePoint: | class CodePoint: | ||||
def __init__(self, x): | def __init__(self, x): | ||||
return True | return True | ||||
return False | return False | ||||
def script(x):
    """Return the script code that script_map holds for the script name *x*.

    Raises KeyError when *x* has no entry in script_map.
    """
    code = script_map[x]
    return code
data_items = { | data_items = { | ||||
'Blocks': [ | 'Blocks': [ | ||||
('Range', codepoint), | ('Range', codepoint), | ||||
], | ], | ||||
'Scripts': [ | 'Scripts': [ | ||||
('Range', codepoint), | ('Range', codepoint), | ||||
('Script', str), | |||||
('Script', script), | |||||
], | ], | ||||
'UnicodeData': [ | 'UnicodeData': [ | ||||
('CodePoint', codepoint), | ('CodePoint', codepoint), |