#!/usr/bin/python

# Copyright (C) 2012-2016 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.

"""Generate the C source for the Unicode script lookup tables.

Usage: scripts.py UCD_ROOTDIR UCD_VERSION [--with-csur]

The 'Scripts' data is read from UCD_ROOTDIR through the ``ucd`` helper
module, and the generated C source (per-page script tables plus the
``ucd_lookup_script`` function) is written to standard output.
UCD_VERSION is only used in a comment of the generated file. With
``--with-csur`` the 'Klingon' data from ``data/csur`` is merged in as well.
"""

import os
import sys

import ucd

ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Map of codepoint -> script name for every codepoint listed in the data.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        unicode_chars[codepoint] = data['Script']
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        for data in ucd.parse_ucd_data('data/csur', csur):
            for codepoint in data['CodePoint']:
                unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (code range, script, comment). A script of None means the
# range is looked up through per-page tables; otherwise the whole range maps
# to that one script.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
    (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'),
]

# These scripts have many pages consisting of just this script:
special_scripts = []

# Map of '<first>_<last>' range name -> {first codepoint of page: page}, where
# a page is either a script name (every codepoint in the page has that script,
# so one shared table is used) or a list with the script of each codepoint.
script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None
        table_codepoint = None
        table_script = None
        for i, codepoint in enumerate(codepoints):
            # Codepoints that are not listed in the data are 'Zzzz' (Unknown).
            codepoint_script = unicode_chars.get(codepoint, 'Zzzz')
            if (i % 256) == 0:
                # Start of a new 256-codepoint page: store the previous one.
                if table_entry:
                    if table_script in special_scripts:
                        table[table_codepoint] = table_script
                    elif table_script:
                        # First page found that only contains this script.
                        special_scripts.append(table_script)
                        table[table_codepoint] = table_script
                    else:
                        table[table_codepoint] = table_entry
                table_entry = []
                table_codepoint = codepoint
                table_script = codepoint_script
            if codepoint_script != table_script:
                # The page contains more than one script.
                table_script = None
            table_entry.append(codepoint_script)
        # Store the last page of the range. Unlike the pages above, this one
        # only shares a table when its script is already a special script.
        if table_entry:
            if table_script in special_scripts:
                table[table_codepoint] = table_script
            else:
                table[table_codepoint] = table_entry
        script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table


def _write_page_table(name, page_scripts):
    """Write the C array ``scripts_<name>`` holding *page_scripts* to stdout.

    The values are written 16 per row, each row prefixed with a comment
    giving the index of its first value in hexadecimal.
    """
    emit = sys.stdout.write
    emit('\n')
    emit('static const uint8_t scripts_%s[256] =\n' % name)
    emit('{')
    for i, page_script in enumerate(page_scripts):
        if (i % 16) == 0:
            emit('\n\t/* %02X */' % i)
        emit(' %s,' % page_script)
    emit('\n};\n')


if __name__ == '__main__':
    emit = sys.stdout.write

    # NOTE(review): the text inside the angle brackets of the license URL and
    # of the second #include was missing from this script; the values below
    # are reconstructed (the standard GPL notice URL and <stddef.h>) -- confirm
    # against a previously generated scripts.c.
    emit("""/* Unicode Scripts
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
 */

/* NOTE: This file is automatically generated from the Scripts.txt file in
 * the Unicode Character database by the ucd-tools/tools/scripts.py script.
 */

#include "ucd/ucd.h"

#include <stddef.h>

#define Adlm UCD_SCRIPT_Adlm
#define Afak UCD_SCRIPT_Afak
#define Aghb UCD_SCRIPT_Aghb
#define Ahom UCD_SCRIPT_Ahom
#define Arab UCD_SCRIPT_Arab
#define Armi UCD_SCRIPT_Armi
#define Armn UCD_SCRIPT_Armn
#define Avst UCD_SCRIPT_Avst
#define Bali UCD_SCRIPT_Bali
#define Bamu UCD_SCRIPT_Bamu
#define Bass UCD_SCRIPT_Bass
#define Batk UCD_SCRIPT_Batk
#define Beng UCD_SCRIPT_Beng
#define Bhks UCD_SCRIPT_Bhks
#define Blis UCD_SCRIPT_Blis
#define Bopo UCD_SCRIPT_Bopo
#define Brah UCD_SCRIPT_Brah
#define Brai UCD_SCRIPT_Brai
#define Bugi UCD_SCRIPT_Bugi
#define Buhd UCD_SCRIPT_Buhd
#define Cakm UCD_SCRIPT_Cakm
#define Cans UCD_SCRIPT_Cans
#define Cari UCD_SCRIPT_Cari
#define Cham UCD_SCRIPT_Cham
#define Cher UCD_SCRIPT_Cher
#define Cirt UCD_SCRIPT_Cirt
#define Copt UCD_SCRIPT_Copt
#define Cprt UCD_SCRIPT_Cprt
#define Cyrl UCD_SCRIPT_Cyrl
#define Cyrs UCD_SCRIPT_Cyrs
#define Deva UCD_SCRIPT_Deva
#define Dsrt UCD_SCRIPT_Dsrt
#define Dupl UCD_SCRIPT_Dupl
#define Egyd UCD_SCRIPT_Egyd
#define Egyh UCD_SCRIPT_Egyh
#define Egyp UCD_SCRIPT_Egyp
#define Elba UCD_SCRIPT_Elba
#define Ethi UCD_SCRIPT_Ethi
#define Geok UCD_SCRIPT_Geok
#define Geor UCD_SCRIPT_Geor
#define Glag UCD_SCRIPT_Glag
#define Gonm UCD_SCRIPT_Gonm
#define Goth UCD_SCRIPT_Goth
#define Gran UCD_SCRIPT_Gran
#define Grek UCD_SCRIPT_Grek
#define Gujr UCD_SCRIPT_Gujr
#define Guru UCD_SCRIPT_Guru
#define Hang UCD_SCRIPT_Hang
#define Hani UCD_SCRIPT_Hani
#define Hano UCD_SCRIPT_Hano
#define Hans UCD_SCRIPT_Hans
#define Hant UCD_SCRIPT_Hant
#define Hatr UCD_SCRIPT_Hatr
#define Hebr UCD_SCRIPT_Hebr
#define Hira UCD_SCRIPT_Hira
#define Hluw UCD_SCRIPT_Hluw
#define Hmng UCD_SCRIPT_Hmng
#define Hrkt UCD_SCRIPT_Hrkt
#define Hung UCD_SCRIPT_Hung
#define Inds UCD_SCRIPT_Inds
#define Ital UCD_SCRIPT_Ital
#define Java UCD_SCRIPT_Java
#define Jpan UCD_SCRIPT_Jpan
#define Jurc UCD_SCRIPT_Jurc
#define Kali UCD_SCRIPT_Kali
#define Kana UCD_SCRIPT_Kana
#define Khar UCD_SCRIPT_Khar
#define Khmr UCD_SCRIPT_Khmr
#define Khoj UCD_SCRIPT_Khoj
#define Knda UCD_SCRIPT_Knda
#define Kore UCD_SCRIPT_Kore
#define Kpel UCD_SCRIPT_Kpel
#define Kthi UCD_SCRIPT_Kthi
#define Lana UCD_SCRIPT_Lana
#define Laoo UCD_SCRIPT_Laoo
#define Latf UCD_SCRIPT_Latf
#define Latg UCD_SCRIPT_Latg
#define Latn UCD_SCRIPT_Latn
#define Lepc UCD_SCRIPT_Lepc
#define Limb UCD_SCRIPT_Limb
#define Lina UCD_SCRIPT_Lina
#define Linb UCD_SCRIPT_Linb
#define Lisu UCD_SCRIPT_Lisu
#define Loma UCD_SCRIPT_Loma
#define Lyci UCD_SCRIPT_Lyci
#define Lydi UCD_SCRIPT_Lydi
#define Mahj UCD_SCRIPT_Mahj
#define Mand UCD_SCRIPT_Mand
#define Mani UCD_SCRIPT_Mani
#define Marc UCD_SCRIPT_Marc
#define Maya UCD_SCRIPT_Maya
#define Mend UCD_SCRIPT_Mend
#define Merc UCD_SCRIPT_Merc
#define Mero UCD_SCRIPT_Mero
#define Mlym UCD_SCRIPT_Mlym
#define Modi UCD_SCRIPT_Modi
#define Mong UCD_SCRIPT_Mong
#define Moon UCD_SCRIPT_Moon
#define Mroo UCD_SCRIPT_Mroo
#define Mtei UCD_SCRIPT_Mtei
#define Mult UCD_SCRIPT_Mult
#define Mymr UCD_SCRIPT_Mymr
#define Narb UCD_SCRIPT_Narb
#define Nbat UCD_SCRIPT_Nbat
#define Newa UCD_SCRIPT_Newa
#define Nkgb UCD_SCRIPT_Nkgb
#define Nkoo UCD_SCRIPT_Nkoo
#define Nshu UCD_SCRIPT_Nshu
#define Ogam UCD_SCRIPT_Ogam
#define Olck UCD_SCRIPT_Olck
#define Orkh UCD_SCRIPT_Orkh
#define Orya UCD_SCRIPT_Orya
#define Osge UCD_SCRIPT_Osge
#define Osma UCD_SCRIPT_Osma
#define Palm UCD_SCRIPT_Palm
#define Pauc UCD_SCRIPT_Pauc
#define Perm UCD_SCRIPT_Perm
#define Phag UCD_SCRIPT_Phag
#define Phli UCD_SCRIPT_Phli
#define Phlp UCD_SCRIPT_Phlp
#define Phlv UCD_SCRIPT_Phlv
#define Phnx UCD_SCRIPT_Phnx
#define Plrd UCD_SCRIPT_Plrd
#define Prti UCD_SCRIPT_Prti
#define Qaak UCD_SCRIPT_Qaak
#define Rjng UCD_SCRIPT_Rjng
#define Roro UCD_SCRIPT_Roro
#define Runr UCD_SCRIPT_Runr
#define Samr UCD_SCRIPT_Samr
#define Sara UCD_SCRIPT_Sara
#define Sarb UCD_SCRIPT_Sarb
#define Saur UCD_SCRIPT_Saur
#define Sgnw UCD_SCRIPT_Sgnw
#define Shaw UCD_SCRIPT_Shaw
#define Shrd UCD_SCRIPT_Shrd
#define Sidd UCD_SCRIPT_Sidd
#define Sind UCD_SCRIPT_Sind
#define Sinh UCD_SCRIPT_Sinh
#define Sora UCD_SCRIPT_Sora
#define Soyo UCD_SCRIPT_Soyo
#define Sund UCD_SCRIPT_Sund
#define Sylo UCD_SCRIPT_Sylo
#define Syrc UCD_SCRIPT_Syrc
#define Syre UCD_SCRIPT_Syre
#define Syrj UCD_SCRIPT_Syrj
#define Syrn UCD_SCRIPT_Syrn
#define Tagb UCD_SCRIPT_Tagb
#define Takr UCD_SCRIPT_Takr
#define Tale UCD_SCRIPT_Tale
#define Talu UCD_SCRIPT_Talu
#define Taml UCD_SCRIPT_Taml
#define Tang UCD_SCRIPT_Tang
#define Tavt UCD_SCRIPT_Tavt
#define Telu UCD_SCRIPT_Telu
#define Teng UCD_SCRIPT_Teng
#define Tfng UCD_SCRIPT_Tfng
#define Tglg UCD_SCRIPT_Tglg
#define Thaa UCD_SCRIPT_Thaa
#define Thai UCD_SCRIPT_Thai
#define Tibt UCD_SCRIPT_Tibt
#define Tirh UCD_SCRIPT_Tirh
#define Ugar UCD_SCRIPT_Ugar
#define Vaii UCD_SCRIPT_Vaii
#define Visp UCD_SCRIPT_Visp
#define Wara UCD_SCRIPT_Wara
#define Wole UCD_SCRIPT_Wole
#define Xpeo UCD_SCRIPT_Xpeo
#define Xsux UCD_SCRIPT_Xsux
#define Yiii UCD_SCRIPT_Yiii
#define Zanb UCD_SCRIPT_Zanb
#define Zinh UCD_SCRIPT_Zinh
#define Zmth UCD_SCRIPT_Zmth
#define Zsym UCD_SCRIPT_Zsym
#define Zxxx UCD_SCRIPT_Zxxx
#define Zyyy UCD_SCRIPT_Zyyy
#define Zzzz UCD_SCRIPT_Zzzz

/* Unicode Character Data %s */
""" % ucd_version)

    # One shared table for each script that fills at least one whole page.
    for special_script in special_scripts:
        _write_page_table(special_script, [special_script] * 256)

    # One table for each page that does not use a shared table.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
        for codepoint in sorted(tables):
            table = tables[codepoint]
            if table in special_scripts:
                continue
            _write_page_table(codepoint, table)

    # One index per table-driven range, pointing at the table of each page.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        table_index = '%s_%s' % (codepoints.first, codepoints.last)
        emit('\n')
        emit('static const uint8_t *scripts_%s[] =\n' % table_index)
        emit('{\n')
        for codepoint, table in sorted(script_tables[table_index].items()):
            if isinstance(table, str):
                emit('\tscripts_%s, /* %s */\n' % (table, codepoint))
            else:
                emit('\tscripts_%s,\n' % codepoint)
        emit('};\n')

    # The lookup function: the ranges are tested in ascending order, so only
    # the upper bound of each range needs to be checked.
    emit('\n')
    emit('ucd_script ucd_lookup_script(codepoint_t c)\n')
    emit('{\n')
    for codepoints, script, comment in script_sets:
        if script:
            emit('\tif (c <= 0x%s) return %s; /* %s : %s */\n'
                 % (codepoints.last, script, codepoints, comment))
        else:
            emit('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
            emit('\t{\n')
            emit('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n'
                 % (codepoints.first, codepoints.last, codepoints.first))
            emit('\t\treturn (ucd_script)table[c % 256];\n')
            emit('\t}\n')
    emit('\treturn Zzzz; /* Invalid Unicode Codepoint */\n')
    emit('}\n')