| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 | 
import os
import sys
import ucd
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
	for codepoint in data['Range']:
		unicode_chars[codepoint] = data['Script']
script_sets = [
	(ucd.CodeRange('000000..00D7FF'), None,   'Multiple Blocks'),
	(ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
	(ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
	(ucd.CodeRange('00F900..02FAFF'), None,   'Multiple Blocks'),
	(ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
	(ucd.CodeRange('0E0000..0E01FF'), None,   'Multiple Blocks'),
	(ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
	(ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
	(ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
	(ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
	(ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
]
special_scripts = []
script_tables = {}
for codepoints, script, comment in script_sets:
	if not script:
		table = {}
		table_entry = None
		table_codepoint = None
		table_script = None
		for i, codepoint in enumerate(codepoints):
			try:
				script = unicode_chars[codepoint]
			except KeyError:
				script = 'Zzzz' 
			if (i % 256) == 0:
				if table_entry:
					if table_script in special_scripts:
						table[table_codepoint] = table_script
					elif table_script:
						special_scripts.append(table_script)
						table[table_codepoint] = table_script
					else:
						table[table_codepoint] = table_entry
				table_entry = []
				table_codepoint = codepoint
				table_script = script
			if script != table_script:
				table_script = None
			table_entry.append(script)
		if table_entry:
			if table_script in special_scripts:
				table[table_codepoint] = table_script
			else:
				table[table_codepoint] = table_entry
		script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
if __name__ == '__main__':
	sys.stdout.write("""/* Unicode Scripts
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */
// NOTE: This file is automatically generated from the Scripts.txt file in
// the Unicode Character database by the ucd-tools/tools/scripts.py script.
#include "ucd/ucd.h"
#include <stddef.h>
using namespace ucd;
// Unicode Character Data %s
""" % ucd_version)
	for script in special_scripts:
		sys.stdout.write('\n')
		sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
		sys.stdout.write('{')
		for i in range(0, 256):
			if (i % 16) == 0:
				sys.stdout.write('\n\t/* %02X */' % i)
			sys.stdout.write(' %s,' % script)
		sys.stdout.write('\n};\n')
	for codepoints, script, comment in script_sets:
		if not script:
			tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
			for codepoint in sorted(tables.keys()):
				table = tables[codepoint]
				if table in special_scripts:
					continue
				sys.stdout.write('\n')
				sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
				sys.stdout.write('{')
				for i, script in enumerate(table):
					if (i % 16) == 0:
						sys.stdout.write('\n\t/* %02X */' % i)
					sys.stdout.write(' %s,' % script)
				sys.stdout.write('\n};\n')
	for codepoints, script, comment in script_sets:
		if not script:
			table_index = '%s_%s' % (codepoints.first, codepoints.last)
			sys.stdout.write('\n')
			sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
			sys.stdout.write('{\n')
			for codepoint, table in sorted(script_tables[table_index].items()):
				if isinstance(table, str):
					sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
				else:
					sys.stdout.write('\tscripts_%s,\n' % codepoint)
			sys.stdout.write('};\n')
	sys.stdout.write('\n')
	sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
	sys.stdout.write('{\n')
	for codepoints, script, comment in script_sets:
		if script:
			sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
		else:
			sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
			sys.stdout.write('\t{\n')
			sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
			sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
			sys.stdout.write('\t}\n')
	sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
	sys.stdout.write('}\n')
 |