Browse Source

Avoid duplicating Lo-only tables.

master
Reece H. Dunn 12 years ago
parent
commit
7f1dd9cc96
2 changed files with 1423 additions and 8367 deletions
  1. 1391
    8351
      src/categories.cpp
  2. 32
    16
      tools/categories.py

+ 1391
- 8351
src/categories.cpp
File diff suppressed because it is too large
View File


+ 32
- 16
tools/categories.py View File

(ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'), (ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
] ]


# Each of these categories has many 256-codepoint pages consisting of just that category:
# Cn -- Unassigned
# Lo -- CJK Ideographs
special_categories = ['Cn', 'Lo']

category_tables = {} category_tables = {}
for codepoints, category, comment in category_sets: for codepoints, category, comment in category_sets:
if not category: if not category:
table = {} table = {}
table_entry = None table_entry = None
table_codepoint = None table_codepoint = None
is_unassigned = True
table_category = None
for i, codepoint in enumerate(codepoints): for i, codepoint in enumerate(codepoints):
try:
category = unicode_chars[codepoint]
except KeyError:
category = 'Cn' # Unassigned
if (i % 256) == 0: if (i % 256) == 0:
if table_entry: if table_entry:
if is_unassigned:
table[table_codepoint] = None
if table_category in special_categories:
table[table_codepoint] = table_category
else: else:
table[table_codepoint] = table_entry table[table_codepoint] = table_entry
table_entry = [] table_entry = []
table_codepoint = codepoint table_codepoint = codepoint
is_unassigned = True
try:
category = unicode_chars[codepoint]
is_unassigned = False
except KeyError:
category = 'Cn' # Unassigned
table_category = category
if category != table_category:
table_category = None
table_entry.append(category) table_entry.append(category)
if table_entry: if table_entry:
if is_unassigned:
table[table_codepoint] = None
if table_category in special_categories:
table[table_codepoint] = table_category
else: else:
table[table_codepoint] = table_entry table[table_codepoint] = table_entry
category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
// Unicode Character Data %s // Unicode Character Data %s
""" % ucd_version) """ % ucd_version)


for category in special_categories:
sys.stdout.write('\n')
sys.stdout.write('static const ucd::category categories_%s[256] =\n' % category)
sys.stdout.write('{')
for i in range(0, 256):
if (i % 16) == 0:
sys.stdout.write('\n\t/* %02X */' % i)
sys.stdout.write(' %s,' % category)
sys.stdout.write('\n};\n')

for codepoints, category, comment in category_sets: for codepoints, category, comment in category_sets:
if not category: if not category:
tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)] tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
for codepoint in sorted(tables.keys()): for codepoint in sorted(tables.keys()):
table = tables[codepoint] table = tables[codepoint]
if not table:
if table in special_categories:
continue continue


sys.stdout.write('\n') sys.stdout.write('\n')
sys.stdout.write('static const ucd::category *categories_%s[] =\n' % table_index) sys.stdout.write('static const ucd::category *categories_%s[] =\n' % table_index)
sys.stdout.write('{\n') sys.stdout.write('{\n')
for codepoint, table in sorted(category_tables[table_index].items()): for codepoint, table in sorted(category_tables[table_index].items()):
if table:
sys.stdout.write('\tcategories_%s,\n' % codepoint)
if isinstance(table, str):
sys.stdout.write('\tcategories_%s, // %s\n' % (table, codepoint))
else: else:
sys.stdout.write('\tNULL, // %s : Unassigned\n' % codepoint)
sys.stdout.write('\tcategories_%s,\n' % codepoint)
sys.stdout.write('};\n') sys.stdout.write('};\n')


sys.stdout.write('\n') sys.stdout.write('\n')
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints)) sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
sys.stdout.write('\t{\n') sys.stdout.write('\t{\n')
sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first)) sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
sys.stdout.write('\t\treturn table ? table[c % 256] : Cn;\n')
sys.stdout.write('\t\treturn table[c % 256];\n')
sys.stdout.write('\t}\n') sys.stdout.write('\t}\n')
sys.stdout.write('\treturn Ii; // Invalid Unicode Codepoint\n') sys.stdout.write('\treturn Ii; // Invalid Unicode Codepoint\n')
sys.stdout.write('}\n') sys.stdout.write('}\n')

Loading…
Cancel
Save