Browse Source

tools/categories.py: generate category lookup tables for the full unicode range.

master
Reece H. Dunn 12 years ago
parent
commit
12bafa6b4d
4 changed files with 119 additions and 7 deletions
  1. 1
    1
      Makefile.am
  2. 1
    0
      src/include/ucd/ucd.h
  3. 95
    3
      tools/categories.py
  4. 22
    3
      tools/ucd.py

+ 1
- 1
Makefile.am View File

@@ -54,7 +54,7 @@ EXTRA_DIST += ChangeLog

############################# Unicode Character Database ######################

# Regenerate the category tables whenever the generator script or the ucd
# helper module it imports changes.
src/categories.cpp: tools/categories.py tools/ucd.py
	tools/categories.py ${UCD_ROOTDIR} > $@

############################# libucd ##########################################

+ 1
- 0
src/include/ucd/ucd.h View File

@@ -40,6 +40,7 @@ namespace ucd

Cc, /**< @brief Control Character */
Cf, /**< @brief Format Control Character */
Ci, /**< @brief Invalid Unicode Character */
Cn, /**< @brief Unassigned */
Co, /**< @brief Private Use */
Cs, /**< @brief Surrogate Code Point */

+ 95
- 3
tools/categories.py View File

@@ -21,8 +21,59 @@ import os
import sys
import ucd

# Root directory of the Unicode Character Database files, taken from the
# first command-line argument.
ucd_rootdir = sys.argv[1]

# Map each individually listed code point to its General Category. Rows
# whose 'CodePoint' field is not a ucd.CodePoint are skipped (presumably
# range entries -- confirm against ucd.parse_ucd_data).
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    if isinstance(data['CodePoint'], ucd.CodePoint):
        unicode_chars[data['CodePoint']] = data['GeneralCategory']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (code range, category, comment). A category of None means
# the range needs per-code-point tables; otherwise the whole range maps to
# that single category.
category_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
    (ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'),
    (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'),
    (ucd.CodeRange('0F0000..0FFFFD'), 'Co', 'Plane 15 Private Use'),
    (ucd.CodeRange('0FFFFE..0FFFFF'), 'Cn', 'Plane 15 Private Use'),
    (ucd.CodeRange('100000..10FFFD'), 'Co', 'Plane 16 Private Use'),
    (ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
]


def _build_block_tables(codepoints, chars):
    """Split *codepoints* into 256-entry category tables.

    Returns a dict keyed by the first code point of each 256-entry block.
    The value is the list of category names for that block, or None when no
    code point of the block is present in *chars*. Code points missing from
    *chars* are recorded as 'Cn' (unassigned).
    """
    tables = {}
    entry = None
    entry_start = None
    has_assigned = False
    for i, codepoint in enumerate(codepoints):
        if (i % 256) == 0:
            # Block boundary: store the finished block before starting anew.
            if entry:
                tables[entry_start] = entry if has_assigned else None
            entry = []
            entry_start = codepoint
            has_assigned = False
        try:
            name = chars[codepoint]
        except KeyError:
            name = 'Cn'  # Unassigned
        else:
            has_assigned = True
        entry.append(name)
    # Store the final block, which the loop above never reaches a boundary for.
    if entry:
        tables[entry_start] = entry if has_assigned else None
    return tables


# Tables for every range that has no single category, keyed by
# '<first>_<last>' of the range.
category_tables = {}
for codepoints, category, comment in category_sets:
    if not category:
        table_key = '%s_%s' % (codepoints.first, codepoints.last)
        category_tables[table_key] = _build_block_tables(codepoints, unicode_chars)

if __name__ == '__main__':
sys.stdout.write("""/* Unicode General Categories
@@ -50,11 +101,52 @@ if __name__ == '__main__':

#include "ucd/ucd.h"

#include <stddef.h>

using namespace ucd;
""")

# Emit one 256-entry C++ array for every block that has a table. Blocks
# stored as None (or empty) are skipped here and become NULL slots in the
# index arrays written below.
for codepoints, category, comment in category_sets:
    if category:
        continue
    tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
    for codepoint in sorted(tables.keys()):
        table = tables[codepoint]
        if not table:
            continue

        sys.stdout.write('\n')
        sys.stdout.write('static const ucd::category categories_%s[256] =\n' % codepoint)
        sys.stdout.write('{')
        for i, entry in enumerate(table):
            # Start a new output row every 16 entries, labelled with the
            # offset of the row's first entry.
            if (i % 16) == 0:
                sys.stdout.write('\n\t/* %02X */' % i)
            sys.stdout.write(' %s,' % entry)
        sys.stdout.write('\n};\n')

# Emit one index array per table-backed range, pointing at the per-block
# arrays above in code point order.
for codepoints, category, comment in category_sets:
    if category:
        continue
    table_index = '%s_%s' % (codepoints.first, codepoints.last)
    sys.stdout.write('\n')
    sys.stdout.write('static const ucd::category *categories_%s[] =\n' % table_index)
    sys.stdout.write('{\n')
    for codepoint, table in sorted(category_tables[table_index].items()):
        if table:
            sys.stdout.write('\tcategories_%s,\n' % codepoint)
        else:
            sys.stdout.write('\tNULL, // %s : Unassigned\n' % codepoint)
    sys.stdout.write('};\n')

# Emit the lookup function: a chain of upper-bound tests in the order the
# ranges appear in category_sets. No unconditional return may be written
# before the chain, or every test below it is unreachable in the output.
sys.stdout.write('\n')
sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n')
sys.stdout.write('{\n')
for codepoints, category, comment in category_sets:
    if category:
        sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
    else:
        sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
        sys.stdout.write('\t{\n')
        sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
        sys.stdout.write('\t\treturn table ? table[c % 256] : Cn;\n')
        sys.stdout.write('\t}\n')
# Anything above the last range gets the Ci category.
sys.stdout.write('\treturn Ci;\n')
sys.stdout.write('}\n')

+ 22
- 3
tools/ucd.py View File

@@ -22,13 +22,28 @@ import sys

class CodePoint:
    """A single code point, stored as an integer in ``self.codepoint``.

    Accepts either a hexadecimal string (e.g. ``'0041'``) or a value that is
    already numeric. Instances hash and compare by their numeric value, so
    they can be used as dictionary keys and sorted.
    """

    def __init__(self, x):
        # Only strings are parsed as hexadecimal; any other value is stored
        # unchanged so that integers can be wrapped directly.
        if isinstance(x, str):
            self.codepoint = int(x, 16)
        else:
            self.codepoint = x

    def __repr__(self):
        # Six digits covers every value up to 10FFFF at a fixed width.
        return '%06X' % self.codepoint

    def __str__(self):
        return '%06X' % self.codepoint

    def __hash__(self):
        return self.codepoint

    def __eq__(self, other):
        # Defer to the other operand instead of failing on a missing
        # ``codepoint`` attribute.
        if not isinstance(other, CodePoint):
            return NotImplemented
        return self.codepoint == other.codepoint

    def __ne__(self, other):
        if not isinstance(other, CodePoint):
            return NotImplemented
        return self.codepoint != other.codepoint

    def __lt__(self, other):
        if not isinstance(other, CodePoint):
            return NotImplemented
        return self.codepoint < other.codepoint

class CodeRange:
def __init__(self, x):
@@ -42,6 +57,10 @@ class CodeRange:
def __str__(self):
return '%s..%s' % (self.first, self.last)

def __iter__(self):
    """Yield a CodePoint for each value from first to last, inclusive."""
    current = self.first.codepoint
    final = self.last.codepoint
    while current <= final:
        yield CodePoint(current)
        current += 1

def size(self):
    """Return how many code points the range holds, counting both ends."""
    lowest = self.first.codepoint
    highest = self.last.codepoint
    return highest - lowest + 1


Loading…
Cancel
Save