@@ -57,22 +57,37 @@ EXTRA_DIST += ChangeLog | |||
UCD_VERSION=6.2.0 | |||
UCD_ROOTDIR=data/ucd | |||
# Reference data files are downloaded on demand.
#
# wget creates/truncates the -O file before the transfer starts, so a failed
# download would leave an empty target behind that make then considers up to
# date. Remove the partial file and fail the rule in that case.

data/language-subtag-registry:
	mkdir -pv data
	wget -O $@ http://www.iana.org/assignments/language-subtag-registry || (rm -f $@; exit 1)

data/ucd/PropList.txt:
	mkdir -pv data/ucd
	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt || (rm -f $@; exit 1)

data/ucd/Scripts.txt:
	mkdir -pv data/ucd
	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt || (rm -f $@; exit 1)

data/ucd/UnicodeData.txt:
	mkdir -pv data/ucd
	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt || (rm -f $@; exit 1)
############################# libucd ########################################## | |||
src/case.cpp: tools/case.py tools/ucd.py data/ucd/UnicodeData.txt | |||
src/case.cpp: tools/case.py tools/ucd.py \ | |||
data/ucd/UnicodeData.txt | |||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
src/categories.cpp: tools/categories.py tools/ucd.py data/ucd/UnicodeData.txt | |||
src/categories.cpp: tools/categories.py tools/ucd.py \ | |||
data/ucd/UnicodeData.txt | |||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
src/scripts.cpp: tools/scripts.py tools/ucd.py \ | |||
data/language-subtag-registry \ | |||
data/ucd/Scripts.txt | |||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@ | |||
libucd_includedir = $(includedir)/ucd | |||
libucd_include_HEADERS = \ | |||
src/include/ucd/ucd.h | |||
@@ -83,7 +98,8 @@ src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS} | |||
src_libucd_la_SOURCES = \ | |||
src/case.cpp \ | |||
src/categories.cpp \ | |||
src/ctype.cpp | |||
src/ctype.cpp \ | |||
src/scripts.cpp | |||
############################# tests ########################################### | |||
@@ -118,6 +118,131 @@ namespace ucd | |||
category lookup_category(codepoint_t c); | |||
//@} | |||
/** @name Unicode Script | |||
* @brief These functions query the Script property of Unicode codepoints. | |||
*/ | |||
//@{ | |||
/** @brief Unicode Script | |||
* @see http://www.iana.org/assignments/language-subtag-registry | |||
* @see http://www.unicode.org/iso15924/iso15924-codes.html | |||
*/ | |||
enum script
{
    // NOTE: the enumerator order defines the numeric values stored in the
    // generated lookup tables, so new scripts must not be inserted in a way
    // that renumbers existing ones without regenerating those tables.
    Arab, /**< @brief Arabic Script */
    Armi, /**< @brief Imperial Aramaic Script */
    Armn, /**< @brief Armenian Script */
    Avst, /**< @brief Avestan Script */
    Bali, /**< @brief Balinese Script */
    Bamu, /**< @brief Bamum Script */
    Batk, /**< @brief Batak Script */
    Beng, /**< @brief Bengali Script */
    Bopo, /**< @brief Bopomofo Script */
    Brah, /**< @brief Brahmi Script */
    Brai, /**< @brief Braille Script */
    Bugi, /**< @brief Buginese Script */
    Buhd, /**< @brief Buhid Script */
    Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
    Cari, /**< @brief Carian Script */
    Cakm, /**< @brief Chakma Script */
    Cham, /**< @brief Cham Script */
    Cher, /**< @brief Cherokee Script */
    Copt, /**< @brief Coptic Script */
    Cprt, /**< @brief Cypriot Script */
    Cyrl, /**< @brief Cyrillic Script */
    Deva, /**< @brief Devanagari Script */
    Dsrt, /**< @brief Deseret Script */
    Egyp, /**< @brief Egyptian Hieroglyphs */
    Ethi, /**< @brief Ethiopic Script */
    Geor, /**< @brief Georgian Script */
    Glag, /**< @brief Glagolitic Script */
    Goth, /**< @brief Gothic Script */
    Grek, /**< @brief Greek Script */
    Gujr, /**< @brief Gujarati Script */
    Guru, /**< @brief Gurmukhi Script */
    Hang, /**< @brief Hangul Script */
    Hano, /**< @brief Hanunoo Script */
    Hant, /**< @brief Han (Traditional) Script */
    Hebr, /**< @brief Hebrew Script */
    Hira, /**< @brief Hiragana Script */
    Ital, /**< @brief Old Italic Script */
    Java, /**< @brief Javanese Script */
    Kali, /**< @brief Kayah Li Script */
    Kana, /**< @brief Katakana Script */
    Khar, /**< @brief Kharoshthi Script */
    Khmr, /**< @brief Khmer Script */
    Knda, /**< @brief Kannada Script */
    Kthi, /**< @brief Kaithi Script */
    Lana, /**< @brief Tai Tham Script */
    Laoo, /**< @brief Lao Script */
    Latn, /**< @brief Latin Script */
    Lepc, /**< @brief Lepcha Script */
    Limb, /**< @brief Limbu Script */
    Linb, /**< @brief Linear B Script */
    Lisu, /**< @brief Lisu Script */
    Lyci, /**< @brief Lycian Script */
    Lydi, /**< @brief Lydian Script */
    Mand, /**< @brief Mandaic Script */
    Merc, /**< @brief Meroitic Cursive Script */
    Mero, /**< @brief Meroitic Hieroglyphs */
    Mlym, /**< @brief Malayalam Script */
    Mong, /**< @brief Mongolian Script */
    Mtei, /**< @brief Meitei Mayek Script */
    Mymr, /**< @brief Myanmar Script */
    Nkoo, /**< @brief N'Ko Script */
    Ogam, /**< @brief Ogham Script */
    Olck, /**< @brief Ol Chiki Script */
    Orkh, /**< @brief Old Turkic Script */
    Orya, /**< @brief Oriya Script */
    Osma, /**< @brief Osmanya Script */
    Phag, /**< @brief Phags-Pa Script */
    Phli, /**< @brief Inscriptional Pahlavi Script */
    Phnx, /**< @brief Phoenician Script */
    Plrd, /**< @brief Miao Script */
    Prti, /**< @brief Inscriptional Parthian Script */
    Rjng, /**< @brief Rejang Script */
    Runr, /**< @brief Runic Script */
    Samr, /**< @brief Samaritan Script */
    Sarb, /**< @brief Old South Arabian Script */
    Saur, /**< @brief Saurashtra Script */
    Shaw, /**< @brief Shavian Script */
    Shrd, /**< @brief Sharada Script */
    Sinh, /**< @brief Sinhala Script */
    Sora, /**< @brief Sora Sompeng Script */
    Sund, /**< @brief Sundanese Script */
    Sylo, /**< @brief Syloti Nagri Script */
    Syrn, /**< @brief Syriac (Eastern) Script */
    Tagb, /**< @brief Tagbanwa Script */
    Takr, /**< @brief Takri Script */
    Tale, /**< @brief Tai Le Script */
    Talu, /**< @brief New Tai Lue Script */
    Taml, /**< @brief Tamil Script */
    Tavt, /**< @brief Tai Viet Script */
    Telu, /**< @brief Telugu Script */
    Tfng, /**< @brief Tifinagh Script */
    Tglg, /**< @brief Tagalog Script */
    Thaa, /**< @brief Thaana Script */
    Thai, /**< @brief Thai Script */
    Tibt, /**< @brief Tibetan Script */
    Ugar, /**< @brief Ugaritic Script */
    Vaii, /**< @brief Vai Script */
    Xpeo, /**< @brief Old Persian Script */
    Xsux, /**< @brief Cuneiform Script */
    Yiii, /**< @brief Yi Script */
    Zyyy, /**< @brief Common Script (the UCD "Inherited" script is also reported as this value) */
    Zzzz  /**< @brief Unknown Script */
};
/** @brief Lookup the Script for a Unicode codepoint. | |||
* | |||
* @param c The Unicode codepoint to lookup. | |||
* @return The Script of the Unicode codepoint. | |||
*/ | |||
script lookup_script(codepoint_t c); | |||
//@} | |||
/** @name ctype-style APIs | |||
* @brief These functions provide wctype compatible functions using the UCD data. |
@@ -78,18 +78,130 @@ const char *get_category_string(ucd::category c) | |||
} | |||
} | |||
/** @brief Get the four-letter code of a script as a C string.
  *
  * @param s The script value to describe.
  * @return The enumerator's own spelling (e.g. "Latn"), or "----" if the
  *         value does not match any known enumerator.
  */
const char *get_script_string(ucd::script s)
{
    using namespace ucd;

    // Every case returns the enumerator name verbatim, so let the
    // preprocessor stringify it instead of spelling each name twice.
#define UCD_SCRIPT_CASE(code) case code: return #code;

    switch (s)
    {
    UCD_SCRIPT_CASE(Arab) UCD_SCRIPT_CASE(Armi) UCD_SCRIPT_CASE(Armn)
    UCD_SCRIPT_CASE(Avst) UCD_SCRIPT_CASE(Bali) UCD_SCRIPT_CASE(Bamu)
    UCD_SCRIPT_CASE(Batk) UCD_SCRIPT_CASE(Beng) UCD_SCRIPT_CASE(Bopo)
    UCD_SCRIPT_CASE(Brah) UCD_SCRIPT_CASE(Brai) UCD_SCRIPT_CASE(Bugi)
    UCD_SCRIPT_CASE(Buhd) UCD_SCRIPT_CASE(Cans) UCD_SCRIPT_CASE(Cari)
    UCD_SCRIPT_CASE(Cakm) UCD_SCRIPT_CASE(Cham) UCD_SCRIPT_CASE(Cher)
    UCD_SCRIPT_CASE(Copt) UCD_SCRIPT_CASE(Cprt) UCD_SCRIPT_CASE(Cyrl)
    UCD_SCRIPT_CASE(Deva) UCD_SCRIPT_CASE(Dsrt) UCD_SCRIPT_CASE(Egyp)
    UCD_SCRIPT_CASE(Ethi) UCD_SCRIPT_CASE(Geor) UCD_SCRIPT_CASE(Glag)
    UCD_SCRIPT_CASE(Goth) UCD_SCRIPT_CASE(Grek) UCD_SCRIPT_CASE(Gujr)
    UCD_SCRIPT_CASE(Guru) UCD_SCRIPT_CASE(Hang) UCD_SCRIPT_CASE(Hano)
    UCD_SCRIPT_CASE(Hant) UCD_SCRIPT_CASE(Hebr) UCD_SCRIPT_CASE(Hira)
    UCD_SCRIPT_CASE(Ital) UCD_SCRIPT_CASE(Java) UCD_SCRIPT_CASE(Kali)
    UCD_SCRIPT_CASE(Kana) UCD_SCRIPT_CASE(Khar) UCD_SCRIPT_CASE(Khmr)
    UCD_SCRIPT_CASE(Knda) UCD_SCRIPT_CASE(Kthi) UCD_SCRIPT_CASE(Lana)
    UCD_SCRIPT_CASE(Laoo) UCD_SCRIPT_CASE(Latn) UCD_SCRIPT_CASE(Lepc)
    UCD_SCRIPT_CASE(Limb) UCD_SCRIPT_CASE(Linb) UCD_SCRIPT_CASE(Lisu)
    UCD_SCRIPT_CASE(Lyci) UCD_SCRIPT_CASE(Lydi) UCD_SCRIPT_CASE(Mand)
    UCD_SCRIPT_CASE(Merc) UCD_SCRIPT_CASE(Mero) UCD_SCRIPT_CASE(Mlym)
    UCD_SCRIPT_CASE(Mong) UCD_SCRIPT_CASE(Mtei) UCD_SCRIPT_CASE(Mymr)
    UCD_SCRIPT_CASE(Nkoo) UCD_SCRIPT_CASE(Ogam) UCD_SCRIPT_CASE(Olck)
    UCD_SCRIPT_CASE(Orkh) UCD_SCRIPT_CASE(Orya) UCD_SCRIPT_CASE(Osma)
    UCD_SCRIPT_CASE(Phag) UCD_SCRIPT_CASE(Phli) UCD_SCRIPT_CASE(Phnx)
    UCD_SCRIPT_CASE(Plrd) UCD_SCRIPT_CASE(Prti) UCD_SCRIPT_CASE(Rjng)
    UCD_SCRIPT_CASE(Runr) UCD_SCRIPT_CASE(Samr) UCD_SCRIPT_CASE(Sarb)
    UCD_SCRIPT_CASE(Saur) UCD_SCRIPT_CASE(Shaw) UCD_SCRIPT_CASE(Shrd)
    UCD_SCRIPT_CASE(Sinh) UCD_SCRIPT_CASE(Sora) UCD_SCRIPT_CASE(Sund)
    UCD_SCRIPT_CASE(Sylo) UCD_SCRIPT_CASE(Syrn) UCD_SCRIPT_CASE(Tagb)
    UCD_SCRIPT_CASE(Takr) UCD_SCRIPT_CASE(Tale) UCD_SCRIPT_CASE(Talu)
    UCD_SCRIPT_CASE(Taml) UCD_SCRIPT_CASE(Tavt) UCD_SCRIPT_CASE(Telu)
    UCD_SCRIPT_CASE(Tfng) UCD_SCRIPT_CASE(Tglg) UCD_SCRIPT_CASE(Thaa)
    UCD_SCRIPT_CASE(Thai) UCD_SCRIPT_CASE(Tibt) UCD_SCRIPT_CASE(Ugar)
    UCD_SCRIPT_CASE(Vaii) UCD_SCRIPT_CASE(Xpeo) UCD_SCRIPT_CASE(Xsux)
    UCD_SCRIPT_CASE(Yiii) UCD_SCRIPT_CASE(Zyyy) UCD_SCRIPT_CASE(Zzzz)
    default: return "----";
    }

#undef UCD_SCRIPT_CASE
}
int main() | |||
{ | |||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
{ | |||
const char *script = get_script_string(ucd::lookup_script(c)); | |||
const char *category = get_category_string(ucd::lookup_category(c)); | |||
const char *category_group = get_category_group_string(ucd::lookup_category_group(c)); | |||
ucd::codepoint_t upper = ucd::toupper(c); | |||
ucd::codepoint_t lower = ucd::tolower(c); | |||
ucd::codepoint_t title = ucd::totitle(c); | |||
const char *whitespace = ucd::isspace(c) ? "White_Space" : ""; | |||
printf("%06X %s %s %06X %06X %06X %s\n", | |||
c, category_group, category, | |||
printf("%06X %s %s %s %06X %06X %06X %s\n", | |||
c, script, category_group, category, | |||
upper, lower, title, | |||
whitespace); | |||
} |
@@ -0,0 +1,102 @@ | |||
#!/usr/bin/python | |||
# Copyright (C) 2012 Reece H. Dunn | |||
# | |||
# This file is part of ucd-tools. | |||
# | |||
# ucd-tools is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 3 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# ucd-tools is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# You should have received a copy of the GNU General Public License | |||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
import os | |||
def read_data(path, split_char=':'):
    """Yield the fields of every non-comment line of a delimited text file.

    path       -- the file to read.
    split_char -- the field separator (default ':').

    Lines starting with '#' are skipped. Every other line, including a blank
    one (which yields ['']), is split on split_char after its newline has
    been removed.
    """
    with open(path) as source:
        for raw in source:
            text = raw.replace('\n', '')
            if text.startswith('#'):
                continue
            yield text.split(split_char)
def fold_lines(path):
    """Yield the logical lines of a file that uses leading-space folding.

    path -- the file to read.

    A physical line that starts with a space continues the previous line: its
    first character is dropped and the remainder is appended to the line being
    accumulated. Empty logical lines are not yielded.
    """
    pending = None
    with open(path) as f:
        for line in f:
            line = line.replace('\n', '')
            if line.startswith(' '):
                # A continuation with nothing before it (first line of the
                # file) starts a new logical line instead of being glued onto
                # the string 'None'.
                pending = '%s%s' % (pending or '', line[1:])
                continue
            if pending:
                yield pending
            pending = line
    # The loop only emits a line once the *next* unfolded line is seen, so the
    # final logical line has to be flushed here or it would be lost.
    if pending:
        yield pending
def iana_subtag_entries(path):
    """Yield each record of a subtag registry file as a dict of its fields.

    path -- the registry file to read (unfolded via fold_lines).

    Records are separated by '%%' lines. Each 'Key: value' line becomes one
    dict entry; a line without ': ' is stored with an empty value. Only
    records that contain a 'Type' field are yielded, which drops the leading
    file header record and means an empty file yields nothing.
    """
    tag = {}
    for line in fold_lines(path):
        if line == '%%':
            if 'Type' in tag:
                yield tag
            tag = {}
            continue

        # Split on the first ': ' only; the value may itself contain ': '.
        key, _, value = line.partition(': ')

        # Only select the first Description. This handles subtag codes
        # that have multiple descriptions (e.g. 'es' maps to "Spanish"
        # and "Castilian").
        if key == 'Description' and key in tag:
            continue
        tag[key] = value

    # The last record has no trailing '%%'; apply the same 'Type' filter as
    # above so that a header-only or empty file does not yield a bogus record.
    if 'Type' in tag:
        yield tag
# Registry 'Type' field value -> the type name used by these tools.
typemap = dict([
    ('extlang', 'ExtLang'),
    ('grandfathered', 'Grandfathered'),
    ('language', 'Language'),
    ('redundant', 'Redundant'),
    ('region', 'Region'),
    ('script', 'Script'),
    ('variant', 'Variant'),
])

# Registry 'Scope' field value -> the type name that replaces 'Language'
# for records carrying that scope.
scopemap = dict([
    ('collection', 'Collection'),
    ('macrolanguage', 'MacroLanguage'),
    ('special', 'Special'),
    ('private-use', 'PrivateUse'),
])
def read_iana_subtags(path):
    """Load a subtag registry file into a dict keyed by subtag (or tag).

    path -- the registry file to read.

    Each record's 'Subtag' field (or 'Tag' when there is no 'Subtag') is
    removed from the record and used as its key. The 'Type' field is
    rewritten through scopemap when the record has a 'Scope' (which is then
    removed), otherwise through typemap. Records whose key contains '..'
    are left out of the result.

    Raises Exception if a record with a 'Scope' is not of Type 'language'.
    """
    registry = {}
    for entry in iana_subtag_entries(path):
        id_field = 'Subtag' if 'Subtag' in entry else 'Tag'
        ref = entry.pop(id_field)

        if 'Scope' not in entry:
            entry['Type'] = typemap[entry['Type']]
        else:
            if entry['Type'] != 'language':
                raise Exception('"Scope" property unexpected for Type="%s"' % entry['Type'])
            entry['Type'] = scopemap[entry['Scope']]
            del entry['Scope']

        if '..' in ref:
            # exclude private use definitions
            continue
        registry[ref] = entry
    return registry
@@ -32,21 +32,29 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'): | |||
if data['Property'] in ['White_Space']: | |||
for codepoint in data['Range']: | |||
unicode_chars[codepoint]['Properties'].append(data['Property']) | |||
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | |||
for codepoint in data['Range']: | |||
unicode_chars[codepoint]['Script'] = data['Script'] | |||
null = ucd.CodePoint('0000') | |||
if __name__ == '__main__': | |||
for codepoint in ucd.CodeRange('000000..10FFFF'): | |||
try: | |||
data = unicode_chars[codepoint] | |||
title = data['TitleCase'] | |||
upper = data['UpperCase'] | |||
lower = data['LowerCase'] | |||
if title == null: title = codepoint | |||
if upper == null: upper = codepoint | |||
if lower == null: lower = codepoint | |||
print '%s %s %s %s %s %s %s' % ( | |||
codepoint, data['GeneralCategory'][0], data['GeneralCategory'], | |||
upper, lower, title, | |||
' '.join(data['Properties'])) | |||
except KeyError: | |||
print '%s C Cn %s %s %s ' % (codepoint, codepoint, codepoint, codepoint) | |||
data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []} | |||
try: | |||
script = data['Script'] | |||
except KeyError: | |||
script = 'Zzzz' | |||
title = data['TitleCase'] | |||
upper = data['UpperCase'] | |||
lower = data['LowerCase'] | |||
if title == null: title = codepoint | |||
if upper == null: upper = codepoint | |||
if lower == null: lower = codepoint | |||
print '%s %s %s %s %s %s %s %s' % ( | |||
codepoint, script, | |||
data['GeneralCategory'][0], data['GeneralCategory'], | |||
upper, lower, title, | |||
' '.join(data['Properties'])) |
@@ -0,0 +1,172 @@ | |||
#!/usr/bin/python | |||
# Copyright (C) 2012 Reece H. Dunn | |||
# | |||
# This file is part of ucd-tools. | |||
# | |||
# ucd-tools is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 3 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# ucd-tools is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# You should have received a copy of the GNU General Public License | |||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
import os | |||
import sys | |||
import ucd | |||
# Command line: <ucd root directory> <ucd version>. The version is only used
# for the comment written into the generated file.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps every codepoint covered by the 'Scripts' UCD data to the value of its
# 'Script' field. Codepoints that are not listed are simply absent.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        unicode_chars[codepoint] = data['Script']
# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (codepoint range, script, comment):
#   - script is None: per-page lookup tables are built for the range;
#   - script is a script code: the whole range is reported as that script
#     without consulting any table.
# The entries are processed in this order when the lookup function is
# generated, so they must stay sorted by range.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
    (ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
    (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
    (ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
    (ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
]
# These scripts have many pages consisting of just this script:
special_scripts = []

# Maps '<first>_<last>' of each table-driven range in script_sets to a dict
# that maps the first codepoint of every 256-codepoint page to either
#   - a script code (str): the whole page is that one script, or
#   - a list with the script of each codepoint on the page.
script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None      # scripts collected for the current page
        table_codepoint = None  # first codepoint of the current page
        table_script = None     # the page's script while it is uniform, else None
        for i, codepoint in enumerate(codepoints):
            try:
                script = unicode_chars[codepoint]
            except KeyError:
                script = 'Zzzz' # Unknown
            if (i % 256) == 0:
                # Start of a new page: store the page that was just completed
                # (table_entry is still None before the very first page).
                if table_entry:
                    if table_script in special_scripts:
                        table[table_codepoint] = table_script
                    elif table_script:
                        # First uniform page seen for this script.
                        special_scripts.append(table_script)
                        table[table_codepoint] = table_script
                    else:
                        table[table_codepoint] = table_entry
                table_entry = []
                table_codepoint = codepoint
                table_script = script
            if script != table_script:
                # The page is mixed. None never equals a script code, so the
                # page stays marked as mixed until the next page starts.
                table_script = None
            table_entry.append(script)
        # Store the last page of the range. Unlike the in-loop case, a uniform
        # last page is only collapsed when its script is already in
        # special_scripts; otherwise it is kept as a full per-codepoint list.
        if table_entry:
            if table_script in special_scripts:
                table[table_codepoint] = table_script
            else:
                table[table_codepoint] = table_entry
        script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
if __name__ == '__main__': | |||
sys.stdout.write("""/* Unicode Scripts | |||
* | |||
* Copyright (C) 2012 Reece H. Dunn | |||
* | |||
* This file is part of ucd-tools. | |||
* | |||
* ucd-tools is free software: you can redistribute it and/or modify | |||
* it under the terms of the GNU General Public License as published by | |||
* the Free Software Foundation, either version 3 of the License, or | |||
* (at your option) any later version. | |||
* | |||
* ucd-tools is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
* GNU General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU General Public License | |||
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
*/ | |||
// NOTE: This file is automatically generated from the Scripts.txt file in | |||
// the Unicode Character database by the ucd-tools/tools/scripts.py script. | |||
#include "ucd/ucd.h" | |||
#include <stddef.h> | |||
using namespace ucd; | |||
// Unicode Character Data %s | |||
""" % ucd_version) | |||
for script in special_scripts: | |||
sys.stdout.write('\n') | |||
sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script) | |||
sys.stdout.write('{') | |||
for i in range(0, 256): | |||
if (i % 16) == 0: | |||
sys.stdout.write('\n\t/* %02X */' % i) | |||
sys.stdout.write(' %s,' % script) | |||
sys.stdout.write('\n};\n') | |||
for codepoints, script, comment in script_sets: | |||
if not script: | |||
tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)] | |||
for codepoint in sorted(tables.keys()): | |||
table = tables[codepoint] | |||
if table in special_scripts: | |||
continue | |||
sys.stdout.write('\n') | |||
sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint) | |||
sys.stdout.write('{') | |||
for i, script in enumerate(table): | |||
if (i % 16) == 0: | |||
sys.stdout.write('\n\t/* %02X */' % i) | |||
sys.stdout.write(' %s,' % script) | |||
sys.stdout.write('\n};\n') | |||
for codepoints, script, comment in script_sets: | |||
if not script: | |||
table_index = '%s_%s' % (codepoints.first, codepoints.last) | |||
sys.stdout.write('\n') | |||
sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index) | |||
sys.stdout.write('{\n') | |||
for codepoint, table in sorted(script_tables[table_index].items()): | |||
if isinstance(table, str): | |||
sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint)) | |||
else: | |||
sys.stdout.write('\tscripts_%s,\n' % codepoint) | |||
sys.stdout.write('};\n') | |||
sys.stdout.write('\n') | |||
sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n') | |||
sys.stdout.write('{\n') | |||
for codepoints, script, comment in script_sets: | |||
if script: | |||
sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment)) | |||
else: | |||
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints)) | |||
sys.stdout.write('\t{\n') | |||
sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first)) | |||
sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n') | |||
sys.stdout.write('\t}\n') | |||
sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n') | |||
sys.stdout.write('}\n') |
@@ -19,6 +19,30 @@ | |||
import os | |||
import sys | |||
import iana | |||
# Maps the script names used in the UCD 'Scripts' data to four-letter script
# codes.
script_map = {
    # UCD script names not derivable from IANA script tags:
    'Canadian_Aboriginal': 'Cans',
    'Common': 'Zyyy',
    'Egyptian_Hieroglyphs': 'Egyp',
    'Inherited': 'Zyyy',
    'Meetei_Mayek': 'Mtei',
    'Nko': 'Nkoo',
    'Phags_Pa': 'Phag',
    # Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
    'Cuneiform': 'Xsux',
}

# Different subtags can reduce to the same name once the parenthesised part of
# the description is dropped (the Cyrillic fix-up below exists for such a
# case). The last subtag processed wins, so iterate in sorted subtag order
# rather than in dict order, which is not guaranteed to be stable across
# Python versions and builds.
# NOTE(review): the script codes in ucd.h appear to match "last subtag in
# sorted order wins" -- confirm against the registry snapshot in use.
_subtags = iana.read_iana_subtags('data/language-subtag-registry')
for ref, tag in sorted(_subtags.items(), key=lambda item: item[0]):
    if tag['Type'] != 'Script':
        continue
    # Convert the IANA script tag descriptions to the UCD script names:
    # drop any " (...)" qualifier and join the remaining words with '_'.
    desc = tag['Description'].split(' (')[0].replace(' ', '_')
    script_map[desc] = ref

# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'
class CodePoint: | |||
def __init__(self, x): | |||
@@ -86,6 +110,9 @@ def boolean(x): | |||
return True | |||
return False | |||
def script(x):
    """Convert a UCD script name to its script code using script_map.

    Raises KeyError if the name has no entry in script_map.
    """
    code = script_map[x]
    return code
data_items = { | |||
'Blocks': [ | |||
('Range', codepoint), | |||
@@ -101,7 +128,7 @@ data_items = { | |||
], | |||
'Scripts': [ | |||
('Range', codepoint), | |||
('Script', str), | |||
('Script', script), | |||
], | |||
'UnicodeData': [ | |||
('CodePoint', codepoint), |