#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.

import os
import sys

import ucd

# Positional command-line arguments: the directory holding the Unicode
# Character Database files, followed by the Unicode version string.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps every codepoint listed in the data files to its script code.  The
# supplemental Klingon data is applied second, so its entries take precedence
# over the 'Scripts' data for any codepoint present in both.
unicode_chars = {}
for record in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    unicode_chars.update((cp, record['Script']) for cp in record['Range'])
for record in ucd.parse_ucd_data('supplemental', 'Klingon'):
    unicode_chars.update((cp, record['Script']) for cp in record['CodePoint'])

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
    (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'),
]
# Each entry is (code range, script, comment).  A script of None means the
# range gets per-page (256 codepoint) lookup tables; otherwise every codepoint
# in the range is reported as that single script.

# These scripts have many pages consisting of just this script:
special_scripts = []


def _table_key(codepoints):
    """Return the 'FIRST_LAST' name used to identify a code range's tables."""
    return '%s_%s' % (codepoints.first, codepoints.last)


def _page_value(entries, uniform_script, register):
    """Choose what to store for one finished page.

    entries        -- the per-codepoint script list collected for the page.
    uniform_script -- the script shared by every entry, or None if mixed.
    register       -- whether a uniform script that is not yet known may be
                      added to special_scripts.

    Returns the script name when the page can use a shared single-script
    table, otherwise the entries list itself.
    """
    if uniform_script in special_scripts:
        return uniform_script
    if register and uniform_script:
        special_scripts.append(uniform_script)
        return uniform_script
    return entries


def _build_page_tables(codepoints):
    """Split a code range into 256-entry pages keyed by their first codepoint.

    Codepoints missing from unicode_chars are recorded as 'Zzzz' (Unknown).
    Appends to special_scripts as a side effect (see _page_value).
    """
    pages = {}
    entries = None
    page_start = None
    uniform_script = None
    for index, codepoint in enumerate(codepoints):
        script = unicode_chars.get(codepoint, 'Zzzz')  # Unknown
        if index % 256 == 0:
            # A new page begins: store the one just completed (if any).
            if entries:
                pages[page_start] = _page_value(entries, uniform_script, True)
            entries = []
            page_start = codepoint
            uniform_script = script
        if script != uniform_script:
            # Mixed page; None never equals a script, so it stays mixed.
            uniform_script = None
        entries.append(script)
    if entries:
        # NOTE: the trailing page reuses an already registered shared script
        # but never registers a new one; this keeps the generated output
        # unchanged.
        pages[page_start] = _page_value(entries, uniform_script, False)
    return pages


script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        script_tables[_table_key(codepoints)] = _build_page_tables(codepoints)


# Preamble of the generated C++ source; takes the Unicode version string.
# NOTE(review): the licence URL and the system header name were missing from
# this text (a bare '#include' does not compile).  They have been restored as
# the standard GPL notice URL and <stddef.h> -- confirm the intended header.
_SOURCE_HEADER = """/* Unicode Scripts
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE: This file is automatically generated from the Scripts.txt file in
// the Unicode Character database by the ucd-tools/tools/scripts.py script.

#include "ucd/ucd.h"

#include <stddef.h>

using namespace ucd;

// Unicode Character Data %s
"""


def _write_table(write, name, entries):
    """Emit one 'scripts_<name>' uint8_t array, sixteen entries per row."""
    write('\n')
    write('static const uint8_t scripts_%s[256] =\n' % name)
    write('{')
    for index, script in enumerate(entries):
        if index % 16 == 0:
            write('\n\t/* %02X */' % index)
        write(' %s,' % script)
    write('\n};\n')


def _write_source(write):
    """Emit the complete generated C++ source through the write callable."""
    write(_SOURCE_HEADER % ucd_version)

    # Shared tables: one per script that fills entire pages.
    for script in special_scripts:
        _write_table(write, script, [script] * 256)

    # Per-page tables for pages that could not use a shared table.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        tables = script_tables[_table_key(codepoints)]
        for codepoint in sorted(tables):
            table = tables[codepoint]
            if table in special_scripts:
                continue
            _write_table(write, codepoint, table)

    # Per-range index arrays pointing at the page tables, in codepoint order.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        table_index = _table_key(codepoints)
        write('\n')
        write('static const uint8_t *scripts_%s[] =\n' % table_index)
        write('{\n')
        for codepoint, table in sorted(script_tables[table_index].items()):
            if isinstance(table, str):
                write('\tscripts_%s, // %s\n' % (table, codepoint))
            else:
                write('\tscripts_%s,\n' % codepoint)
        write('};\n')

    # The lookup function tests the ranges in the order given by script_sets.
    write('\n')
    write('ucd::script ucd::lookup_script(codepoint_t c)\n')
    write('{\n')
    for codepoints, script, comment in script_sets:
        if script:
            write('\tif (c <= 0x%s) return %s; // %s : %s\n'
                  % (codepoints.last, script, codepoints, comment))
        else:
            write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
            write('\t{\n')
            write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n'
                  % (codepoints.first, codepoints.last, codepoints.first))
            write('\t\treturn (ucd::script)table[c % 256];\n')
            write('\t}\n')
    write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
    write('}\n')


if __name__ == '__main__':
    _write_source(sys.stdout.write)