12 years ago · 12bafa6b4d
--- a/Makefile.am
+++ b/Makefile.am
@@ -54,7 +54,7 @@ EXTRA_DIST += ChangeLog

 ############################# Unicode Character Database ######################

 # Regenerate the category tables whenever the generator script or the ucd
 # module it imports changes. The generator writes the C++ source to stdout.
 src/categories.cpp: tools/categories.py tools/ucd.py
 	tools/categories.py ${UCD_ROOTDIR} > $@

 ############################# libucd ##########################################
--- a/src/include/ucd/ucd.h
+++ b/src/include/ucd/ucd.h
@@ -40,6 +40,7 @@ namespace ucd

 		Cc, /**< @brief Control Character */
 		Cf, /**< @brief Format Control Character */
 		Ci, /**< @brief Invalid Unicode Character */
 		Cn, /**< @brief Unassigned */
 		Co, /**< @brief Private Use */
 		Cs, /**< @brief Surrogate Code Point */
--- a/tools/categories.py
+++ b/tools/categories.py
@@ -21,8 +21,59 @@ import os
 import sys
 import ucd

 # Directory holding the Unicode Character Database files (first CLI argument).
 ucd_rootdir = sys.argv[1]

 # Map each individual code point to its General Category. UnicodeData is parsed
 # exactly once; records whose 'CodePoint' field is not a ucd.CodePoint are
 # skipped (presumably these are range records -- confirm against
 # ucd.parse_ucd_data).
 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	if isinstance(data['CodePoint'], ucd.CodePoint):
 		unicode_chars[data['CodePoint']] = data['GeneralCategory']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
 # need to be generated.
 #
 # Each entry is a (code range, category, comment) tuple:
 #   - code range: the inclusive span of code points the entry covers;
 #   - category:   the General Category shared by every code point in the
 #                 range, or None when the range needs per-code-point lookup
 #                 tables built from UnicodeData;
 #   - comment:    free text copied into the generated source as a comment.
 #
 # The entries are listed in ascending, contiguous order; the generated lookup
 # function tests only the upper bound of each range and relies on that order.
 category_sets = [
 	(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
 	(ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
 	(ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'),
 	(ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
 	(ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'),
 	(ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
 	(ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'),
 	(ucd.CodeRange('0F0000..0FFFFD'), 'Co', 'Plane 15 Private Use'),
 	(ucd.CodeRange('0FFFFE..0FFFFF'), 'Cn', 'Plane 15 Private Use'),
 	(ucd.CodeRange('100000..10FFFD'), 'Co', 'Plane 16 Private Use'),
 	(ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
 ]

 # For every range without a single shared category, split the range into
 # pages of 256 consecutive code points. Each page is stored under its first
 # code point as either a list of category names (one per code point) or None
 # when no code point of the page appears in unicode_chars.
 category_tables = {}
 for codepoints, category, comment in category_sets:
 	if category:
 		continue # uniform range: no tables are needed
 	pages = {}
 	page = None
 	page_start = None
 	page_has_data = False
 	for codepoint in codepoints:
 		if page is None or len(page) == 256:
 			# Page boundary: store the finished page, then start a new one.
 			if page:
 				pages[page_start] = page if page_has_data else None
 			page = []
 			page_start = codepoint
 			page_has_data = False
 		if codepoint in unicode_chars:
 			page.append(unicode_chars[codepoint])
 			page_has_data = True
 		else:
 			page.append('Cn') # Unassigned
 	# Store the page that was still open when the range ended.
 	if page:
 		pages[page_start] = page if page_has_data else None
 	category_tables['%s_%s' % (codepoints.first, codepoints.last)] = pages

 if __name__ == '__main__':
 	sys.stdout.write("""/* Unicode General Categories
@@ -50,11 +101,52 @@ if __name__ == '__main__':

 #include "ucd/ucd.h"

 #include <stddef.h>

 using namespace ucd;
 """)

 	# Emit one 256-entry category array for each stored page; pages recorded
 	# as None produce no array.
 	emit = sys.stdout.write
 	for codepoints, category, comment in category_sets:
 		if category:
 			continue # uniform range: it has no page tables
 		pages = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
 		for page_start in sorted(pages):
 			page = pages[page_start]
 			if not page:
 				continue

 			emit('\n')
 			emit('static const ucd::category categories_%s[256] =\n' % page_start)
 			emit('{')
 			for offset, name in enumerate(page):
 				# Start a new row, labelled with its offset, every 16 entries.
 				if offset % 16 == 0:
 					emit('\n\t/* %02X */' % offset)
 				emit(' %s,' % name)
 			emit('\n};\n')

 	# Emit, for each table-driven range, an array of pointers to its page
 	# arrays in ascending code point order. Pages recorded as None get a NULL
 	# slot so the array can still be indexed by page number.
 	out = sys.stdout.write
 	for codepoints, category, comment in category_sets:
 		if category:
 			continue # uniform range: it has no index table
 		table_index = '%s_%s' % (codepoints.first, codepoints.last)
 		pages = category_tables[table_index]
 		out('\n')
 		out('static const ucd::category *categories_%s[] =\n' % table_index)
 		out('{\n')
 		for page_start in sorted(pages):
 			if pages[page_start]:
 				out('\tcategories_%s,\n' % page_start)
 			else:
 				out('\tNULL, // %s : Unassigned\n' % page_start)
 		out('};\n')

 	# Emit ucd::lookup_category(). The generated function tests the upper bound
 	# of each range in category_sets in order, so it relies on the ranges being
 	# listed in ascending, contiguous order. No unconditional return may be
 	# written before the range checks, otherwise they would all be unreachable.
 	sys.stdout.write('\n')
 	sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n')
 	sys.stdout.write('{\n')
 	for codepoints, category, comment in category_sets:
 		if category:
 			# Uniform range: every code point shares one category.
 			sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
 		else:
 			# Table-driven range: select the 256-entry page, then the entry
 			# within it. A NULL page means the whole page is unassigned.
 			sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
 			sys.stdout.write('\t{\n')
 			sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
 			sys.stdout.write('\t\treturn table ? table[c % 256] : Cn;\n')
 			sys.stdout.write('\t}\n')
 	# Anything above the last range is reported as an invalid character.
 	sys.stdout.write('\treturn Ci;\n')
 	sys.stdout.write('}\n')
--- a/tools/ucd.py
+++ b/tools/ucd.py
@@ -22,13 +22,28 @@ import sys

 class CodePoint:
 	"""A single Unicode code point.

 	The value is stored as an integer in ``self.codepoint``. Instances are
 	hashable and ordered by that integer, so they can be used as dictionary
 	keys and sorted. The comparison methods expect ``other`` to expose a
 	``codepoint`` attribute.
 	"""

 	def __init__(self, x):
 		"""Create a code point from a hexadecimal string or an integer.

 		x -- either a string of hexadecimal digits (e.g. '0041') or the
 		     integer value of the code point.
 		"""
 		# Only strings are parsed as hexadecimal; int(x, 16) raises TypeError
 		# for a non-string argument, so integers must be stored as they are.
 		if isinstance(x, str):
 			self.codepoint = int(x, 16)
 		else:
 			self.codepoint = x

 	def __repr__(self):
 		"""Return the value as at least six upper-case hexadecimal digits."""
 		return '%06X' % self.codepoint

 	def __str__(self):
 		"""Return the value as at least six upper-case hexadecimal digits."""
 		return '%06X' % self.codepoint

 	def __hash__(self):
 		"""Hash by integer value, consistent with __eq__."""
 		return self.codepoint

 	def __eq__(self, other):
 		"""Return True when both objects have the same integer value."""
 		return self.codepoint == other.codepoint

 	def __ne__(self, other):
 		"""Return True when the objects have different integer values."""
 		return self.codepoint != other.codepoint

 	def __lt__(self, other):
 		"""Order code points by integer value."""
 		return self.codepoint < other.codepoint

 class CodeRange:
 	def __init__(self, x):
@@ -42,6 +57,10 @@ class CodeRange:
 	def __str__(self):
 		return '%s..%s' % (self.first, self.last)

 	def __iter__(self):
 		"""Yield a CodePoint for every value from first to last, inclusive."""
 		value = self.first.codepoint
 		stop = self.last.codepoint
 		while value <= stop:
 			yield CodePoint(value)
 			value += 1

 	def size(self):
 		return self.last.codepoint - self.first.codepoint + 1