#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command line: <UCD data root directory> <UCD version string>.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Map every codepoint listed in the data files to its script. The supplemental
# Klingon data is applied last, so it wins over Scripts for any shared codepoint.
unicode_chars = {}
for record in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
	unicode_chars.update((cp, record['Script']) for cp in record['Range'])
for record in ucd.parse_ucd_data('supplemental', 'Klingon'):
	unicode_chars.update((cp, record['Script']) for cp in record['CodePoint'])

# Coarse partition of the codespace, combining information from the
# UnicodeData and Blocks data files so that fewer character tables need to be
# generated. Each entry is (code range, script, comment):
#   - a script name means the whole range is reported as that script;
#   - None means the range is broken down into per-256-codepoint tables.
script_sets = [
	(ucd.CodeRange(span), script, comment)
	for span, script, comment in (
		('000000..00D7FF', None,   'Multiple Blocks'),
		('00D800..00F7FF', 'Zzzz', 'Surrogates / Private Use Area'),
		('00F800..02FAFF', None,   'Multiple Blocks'),
		('02FB00..0DFFFF', 'Zzzz', 'Unassigned'),
		('0E0000..0E01FF', None,   'Multiple Blocks'),
		('0E0200..10FFFF', 'Zzzz', 'Unassigned'),
	)
]

# Scripts for which at least one whole 256-codepoint page consists of just
# that script. Filled in while the tables below are built; every such page
# shares a single generated table instead of getting its own.
special_scripts = []


def _store_page(pages, start, entries, uniform, register_new):
	"""Record one finished 256-codepoint page in `pages` under `start`.

	`entries` holds the per-codepoint scripts of the page and `uniform` is
	the script shared by every entry (None when the page is mixed).

	A uniform page whose script is already in `special_scripts` is stored as
	the script name. When `register_new` is true, a uniform page with a not
	yet registered script adds that script to `special_scripts` and is stored
	by name as well. Every other page is stored as its full entry list.
	"""
	if uniform in special_scripts:
		pages[start] = uniform
	elif register_new and uniform:
		special_scripts.append(uniform)
		pages[start] = uniform
	else:
		pages[start] = entries


# Maps '<first>_<last>' of each table-driven range to a dict that maps the
# first codepoint of every page to either a script name or a list of scripts.
script_tables = {}
for code_range, fixed_script, _comment in script_sets:
	if fixed_script:
		# The whole range has a single script; no tables are needed.
		continue

	pages = {}
	start = None
	entries = []
	uniform = None
	for offset, cp in enumerate(code_range):
		value = unicode_chars.get(cp, 'Zzzz') # Unknown when not listed
		if offset % 256 == 0:
			# Page boundary: close the previous page and open a new one.
			if entries:
				_store_page(pages, start, entries, uniform, True)
			start, entries, uniform = cp, [], value
		if value != uniform:
			uniform = None # the page mixes scripts
		entries.append(value)

	# The last page of a range never registers a new shared script; it is only
	# stored by name when its script is already in special_scripts.
	if entries:
		_store_page(pages, start, entries, uniform, False)

	script_tables['%s_%s' % (code_range.first, code_range.last)] = pages

def _page_definition(name, values):
	"""Return the C++ source text defining the `scripts_<name>` page array.

	The values are laid out 16 per row, each row prefixed with a comment
	giving the hexadecimal offset of its first entry. The array is always
	declared with 256 elements.
	"""
	parts = ['\n', 'static const uint8_t scripts_%s[256] =\n' % name, '{']
	for index, value in enumerate(values):
		if index % 16 == 0:
			parts.append('\n\t/* %02X */' % index)
		parts.append(' %s,' % value)
	parts.append('\n};\n')
	return ''.join(parts)


if __name__ == '__main__':
	emit = sys.stdout.write

	# License banner, includes and the data version of the generated file.
	emit("""/* Unicode Scripts
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE: This file is automatically generated from the Scripts.txt file in
// the Unicode Character database by the ucd-tools/tools/scripts.py script.

#include "ucd/ucd.h"

#include <stddef.h>

using namespace ucd;

// Unicode Character Data %s
""" % ucd_version)

	# One shared page per script that fills at least one whole page.
	for name in special_scripts:
		emit(_page_definition(name, [name] * 256))

	# A dedicated page for every page that is stored as a list of scripts.
	for code_range, fixed_script, _comment in script_sets:
		if fixed_script:
			continue
		pages = script_tables['%s_%s' % (code_range.first, code_range.last)]
		for start in sorted(pages):
			page = pages[start]
			if page in special_scripts:
				continue # covered by the shared page emitted above
			emit(_page_definition(start, page))

	# Per-range arrays of pointers to the pages, in codepoint order.
	for code_range, fixed_script, _comment in script_sets:
		if fixed_script:
			continue
		key = '%s_%s' % (code_range.first, code_range.last)
		emit('\n')
		emit('static const uint8_t *scripts_%s[] =\n' % key)
		emit('{\n')
		for start, page in sorted(script_tables[key].items()):
			if isinstance(page, str):
				emit('\tscripts_%s, // %s\n' % (page, start))
			else:
				emit('\tscripts_%s,\n' % start)
		emit('};\n')

	# The lookup function: ranges are tested in ascending order, so each test
	# only needs to compare against the upper bound of its range.
	emit('\n')
	emit('ucd::script ucd::lookup_script(codepoint_t c)\n')
	emit('{\n')
	for code_range, fixed_script, note in script_sets:
		if fixed_script:
			emit('\tif (c <= 0x%s) return %s; // %s : %s\n' % (code_range.last, fixed_script, code_range, note))
			continue
		emit('\tif (c <= 0x%s) // %s\n' % (code_range.last, code_range))
		emit('\t{\n')
		emit('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (code_range.first, code_range.last, code_range.first))
		emit('\t\treturn (ucd::script)table[c % 256];\n')
		emit('\t}\n')
	emit('\treturn Zzzz; // Invalid Unicode Codepoint\n')
	emit('}\n')