Browse Source

tools/categories.py: generate category lookup tables for the full unicode range.

master
Reece H. Dunn 12 years ago
parent
commit
12bafa6b4d
4 changed files with 119 additions and 7 deletions
  1. 1
    1
      Makefile.am
  2. 1
    0
      src/include/ucd/ucd.h
  3. 95
    3
      tools/categories.py
  4. 22
    3
      tools/ucd.py

+ 1
- 1
Makefile.am View File



############################# Unicode Character Database ###################### ############################# Unicode Character Database ######################


src/categories.cpp: tools/categories.py
src/categories.cpp: tools/categories.py tools/ucd.py
tools/categories.py ${UCD_ROOTDIR} > $@ tools/categories.py ${UCD_ROOTDIR} > $@


############################# libucd ########################################## ############################# libucd ##########################################

+ 1
- 0
src/include/ucd/ucd.h View File



Cc, /**< @brief Control Character */ Cc, /**< @brief Control Character */
Cf, /**< @brief Format Control Character */ Cf, /**< @brief Format Control Character */
Ci, /**< @brief Invalid Unicode Character */
Cn, /**< @brief Unassigned */ Cn, /**< @brief Unassigned */
Co, /**< @brief Private Use */ Co, /**< @brief Private Use */
Cs, /**< @brief Surrogate Code Point */ Cs, /**< @brief Surrogate Code Point */

+ 95
- 3
tools/categories.py View File

import sys import sys
import ucd import ucd


ucd_rootdir = sys.argv[1]
unicode_data = ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData')
ucd_rootdir = sys.argv[1]

# Map each individually listed code point in the UnicodeData file to its
# general category.
# NOTE(review): records whose 'CodePoint' field is not a ucd.CodePoint are
# skipped; if parse_ucd_data yields ranges for some records, the members of
# those ranges will later be reported as unassigned -- confirm this is intended.
unicode_chars = dict(
    (record['CodePoint'], record['GeneralCategory'])
    for record in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData')
    if isinstance(record['CodePoint'], ucd.CodePoint)
)

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
# Each entry is (code point range, general category, description).  A category
# of None means the range needs per-code-point lookup tables; otherwise every
# code point in the range is reported with the given category.
category_sets = [
    (ucd.CodeRange(span), general_category, note)
    for span, general_category, note in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00DFFF', 'Cs', 'Surrogates'),
        ('00E000..00F8FF', 'Co', 'Private Use Area'),
        ('00F900..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Cn', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..0EFFFF', 'Cn', 'Unassigned'),
        ('0F0000..0FFFFD', 'Co', 'Plane 15 Private Use'),
        ('0FFFFE..0FFFFF', 'Cn', 'Plane 15 Private Use'),
        ('100000..10FFFD', 'Co', 'Plane 16 Private Use'),
        ('10FFFE..10FFFF', 'Cn', 'Plane 16 Private Use'),
    )
]

# Build one table per category set that has no fixed category.  A table maps
# the first code point of each 256-entry page to the list of general categories
# for that page, or to None when none of the page's code points appear in
# unicode_chars.  Tables are keyed by '<first>_<last>' of the range.
category_tables = {}
for codepoints, category, comment in category_sets:
    if category:
        # The whole range shares a single category; no table is required.
        continue

    members = list(codepoints)
    pages = {}
    for offset in range(0, len(members), 256):
        page = members[offset:offset + 256]
        if any(member in unicode_chars for member in page):
            # Code points missing from unicode_chars are Unassigned ('Cn').
            pages[page[0]] = [unicode_chars.get(member, 'Cn') for member in page]
        else:
            pages[page[0]] = None

    category_tables['%s_%s' % (codepoints.first, codepoints.last)] = pages


if __name__ == '__main__': if __name__ == '__main__':
sys.stdout.write("""/* Unicode General Categories sys.stdout.write("""/* Unicode General Categories


#include "ucd/ucd.h" #include "ucd/ucd.h"


#include <stddef.h>

using namespace ucd; using namespace ucd;
""") """)


# NOTE(review): indentation was stripped from this listing; the nesting
# described in the comments below is inferred from the statement order.
#
# Pass 1: for every category set without a fixed category, write one C array
# named categories_<first code point of page> per page of its table.
for codepoints, category, comment in category_sets:
if not category:
tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
for codepoint in sorted(tables.keys()):
table = tables[codepoint]
# Pages stored as None (or empty) get no array of their own.
if not table:
continue

sys.stdout.write('\n')
sys.stdout.write('static const ucd::category categories_%s[256] =\n' % codepoint)
sys.stdout.write('{')
for i, category in enumerate(table):
# Start a new output line every 16 entries, labelled with the hex offset.
if (i % 16) == 0:
sys.stdout.write('\n\t/* %02X */' % i)
sys.stdout.write(' %s,' % category)
sys.stdout.write('\n};\n')

# Pass 2: for the same category sets, write an array of pointers named
# categories_<first>_<last> with one entry per page, in sorted key order.
for codepoints, category, comment in category_sets:
if not category:
table_index = '%s_%s' % (codepoints.first, codepoints.last)
sys.stdout.write('\n')
sys.stdout.write('static const ucd::category *categories_%s[] =\n' % table_index)
sys.stdout.write('{\n')
for codepoint, table in sorted(category_tables[table_index].items()):
# Pages that produced an array in pass 1 are referenced by name; the
# others are written as NULL.
if table:
sys.stdout.write('\tcategories_%s,\n' % codepoint)
else:
sys.stdout.write('\tNULL, // %s : Unassigned\n' % codepoint)
sys.stdout.write('};\n')

sys.stdout.write('\n') sys.stdout.write('\n')
sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n') sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n')
sys.stdout.write('{\n') sys.stdout.write('{\n')
sys.stdout.write('\treturn Cn;\n')
# NOTE(review): indentation was stripped from this listing; the nesting
# described in the comments below is inferred from the statement order.
#
# Write the body of the generated lookup function: one `if (c <= last)` test
# per category set, in category_sets order.
for codepoints, category, comment in category_sets:
if category:
# Fixed-category range: the generated code returns the category directly.
sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
else:
# Table-backed range: the generated code selects the page with
# (c - first) / 256 and the entry with c % 256, returning Cn when the
# page pointer is NULL.
sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
sys.stdout.write('\t{\n')
sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
sys.stdout.write('\t\treturn table ? table[c % 256] : Cn;\n')
sys.stdout.write('\t}\n')
# Anything that passed none of the tests above is reported as Ci.
sys.stdout.write('\treturn Ci;\n')
sys.stdout.write('}\n') sys.stdout.write('}\n')

+ 22
- 3
tools/ucd.py View File



class CodePoint:
    """A single code point, compared, ordered and hashed by its integer value.

    Instances are usable as dictionary keys and can be sorted.  Comparing a
    CodePoint with an object that has no ``codepoint`` attribute yields
    ``NotImplemented`` (so ``==`` is False and ``!=`` is True) instead of
    raising ``AttributeError``.
    """

    def __init__(self, x):
        """Create a code point from *x*.

        x -- a ``str`` is parsed as a hexadecimal number; any other value is
             stored unchanged as the code point value.
        """
        if isinstance(x, str):
            self.codepoint = int(x, 16)
        else:
            self.codepoint = x

    def __repr__(self):
        """Return the value as at least six upper-case hexadecimal digits."""
        return '%06X' % self.codepoint

    def __str__(self):
        """Return the value as at least six upper-case hexadecimal digits."""
        return '%06X' % self.codepoint

    def __hash__(self):
        """Hash by value so that equal code points share a dictionary slot."""
        return self.codepoint

    def __eq__(self, other):
        """Return True when *other* carries the same code point value."""
        try:
            other_value = other.codepoint
        except AttributeError:
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return self.codepoint == other_value

    def __ne__(self, other):
        """Return the negation of ``__eq__`` (spelled out for Python 2)."""
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __lt__(self, other):
        """Order code points by value; this is what ``sorted()`` relies on."""
        try:
            other_value = other.codepoint
        except AttributeError:
            return NotImplemented
        return self.codepoint < other_value


class CodeRange: class CodeRange:
def __init__(self, x): def __init__(self, x):
def __str__(self): def __str__(self):
return '%s..%s' % (self.first, self.last) return '%s..%s' % (self.first, self.last)


def __iter__(self):
    """Yield a CodePoint for every value from first to last, inclusive."""
    value = self.first.codepoint
    stop = self.last.codepoint
    while value <= stop:
        yield CodePoint(value)
        value += 1

def size(self): def size(self):
return self.last.codepoint - self.first.codepoint + 1 return self.last.codepoint - self.first.codepoint + 1



Loading…
Cancel
Save