12 years ago · 65f95033c8
--- a/Makefile.am
+++ b/Makefile.am
@@ -57,22 +57,37 @@ EXTRA_DIST += ChangeLog
 # Unicode Character Database release: selects the download URLs below and is
 # passed as the second argument to the generator scripts in tools/.
 UCD_VERSION=6.2.0
 # Directory holding the downloaded UCD data files; passed as the first
 # argument to the generator scripts in tools/.
 UCD_ROOTDIR=data/ucd

 # Fetch the IANA language subtag registry. The download goes to a temporary
 # name and is renamed only on success, so a failed or interrupted wget cannot
 # leave an empty or truncated file that make would treat as up to date.
 data/language-subtag-registry:
 	mkdir -pv $(@D)
 	wget -O $@.tmp http://www.iana.org/assignments/language-subtag-registry
 	mv -f $@.tmp $@

 # Fetch PropList.txt for the configured UCD version. The download goes to a
 # temporary name and is renamed only on success, so a failed wget cannot leave
 # an empty or truncated file that make would treat as up to date.
 data/ucd/PropList.txt:
 	mkdir -pv $(@D)
 	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt
 	mv -f $@.tmp $@

 # Fetch Scripts.txt for the configured UCD version. The download goes to a
 # temporary name and is renamed only on success, so a failed wget cannot leave
 # an empty or truncated file that make would treat as up to date.
 data/ucd/Scripts.txt:
 	mkdir -pv $(@D)
 	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt
 	mv -f $@.tmp $@

 # Fetch UnicodeData.txt for the configured UCD version. The download goes to a
 # temporary name and is renamed only on success, so a failed wget cannot leave
 # an empty or truncated file that make would treat as up to date.
 data/ucd/UnicodeData.txt:
 	mkdir -pv $(@D)
 	wget -O $@.tmp http://www.unicode.org/Public/${UCD_VERSION}/ucd/UnicodeData.txt
 	mv -f $@.tmp $@

 ############################# libucd ##########################################

 # tools/ucd.py imports tools/iana.py and reads data/language-subtag-registry
 # as soon as it is loaded, so both are prerequisites of every generated file
 # that lists tools/ucd.py. The output is written to a temporary file and
 # renamed on success so a failing generator cannot leave a truncated target.
 src/case.cpp: tools/case.py tools/ucd.py tools/iana.py \
 	data/language-subtag-registry \
 	data/ucd/UnicodeData.txt
 	tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@.tmp
 	mv -f $@.tmp $@

 # tools/ucd.py imports tools/iana.py and reads data/language-subtag-registry
 # as soon as it is loaded, so both are prerequisites of every generated file
 # that lists tools/ucd.py. The output is written to a temporary file and
 # renamed on success so a failing generator cannot leave a truncated target.
 src/categories.cpp: tools/categories.py tools/ucd.py tools/iana.py \
 	data/language-subtag-registry \
 	data/ucd/UnicodeData.txt
 	tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@.tmp
 	mv -f $@.tmp $@

 # tools/ucd.py imports tools/iana.py, so a change to the registry parser must
 # regenerate this file too. The output is written to a temporary file and
 # renamed on success so a failing generator cannot leave a truncated target.
 src/scripts.cpp: tools/scripts.py tools/ucd.py tools/iana.py \
 	data/language-subtag-registry \
 	data/ucd/Scripts.txt
 	tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > $@.tmp
 	mv -f $@.tmp $@

 libucd_includedir = $(includedir)/ucd
 libucd_include_HEADERS = \
 	src/include/ucd/ucd.h
@@ -83,7 +98,8 @@ src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS}
 # Sources of libucd. case.cpp, categories.cpp and scripts.cpp are produced by
 # the tools/*.py generator rules in this file.
 src_libucd_la_SOURCES  = \
 	src/case.cpp \
 	src/categories.cpp \
 	src/ctype.cpp \
 	src/scripts.cpp

 ############################# tests ###########################################

--- a/src/include/ucd/ucd.h
+++ b/src/include/ucd/ucd.h
@@ -118,6 +118,131 @@ namespace ucd
 	category lookup_category(codepoint_t c);


 	//@}
 	/** @name  Unicode Script
 	  * @brief These functions query the Script property of Unicode codepoints.
 	  */
 	//@{


 	/** @brief Unicode Script
 	  * @see   http://www.iana.org/assignments/language-subtag-registry
 	  * @see   http://www.unicode.org/iso15924/iso15924-codes.html
 	  */
 	enum script
 	{
 		Arab, /**< @brief Arabic Script */
 		Armi, /**< @brief Imperial Aramaic Script */
 		Armn, /**< @brief Armenian Script */
 		Avst, /**< @brief Avestan Script */
 		Bali, /**< @brief Balinese Script */
 		Bamu, /**< @brief Bamum Script */
 		Batk, /**< @brief Batak Script */
 		Beng, /**< @brief Bengali Script */
 		Bopo, /**< @brief Bopomofo Script */
 		Brah, /**< @brief Brahmi Script */
 		Brai, /**< @brief Braille Script */
 		Bugi, /**< @brief Buginese Script */
 		Buhd, /**< @brief Buhid Script */
 		Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
 		Cari, /**< @brief Carian Script */
 		Cakm, /**< @brief Chakma Script */
 		Cham, /**< @brief Cham Script */
 		Cher, /**< @brief Cherokee Script */
 		Copt, /**< @brief Coptic Script */
 		Cprt, /**< @brief Cypriot Script */
 		Cyrl, /**< @brief Cyrillic Script */
 		Deva, /**< @brief Devanagari Script */
 		Dsrt, /**< @brief Deseret Script */
 		Egyp, /**< @brief Egyptian Hieroglyphs */
 		Ethi, /**< @brief Ethiopic Script */
 		Geor, /**< @brief Georgian Script */
 		Glag, /**< @brief Glagolitic Script */
 		Goth, /**< @brief Gothic Script */
 		Grek, /**< @brief Greek Script */
 		Gujr, /**< @brief Gujarati Script */
 		Guru, /**< @brief Gurmukhi Script */
 		Hang, /**< @brief Hangul Script */
 		Hano, /**< @brief Hanunoo Script */
 		Hant, /**< @brief Han (Traditional) Script */
 		Hebr, /**< @brief Hebrew Script */
 		Hira, /**< @brief Hiragana Script */
 		Ital, /**< @brief Old Italic Script */
 		Java, /**< @brief Javanese Script */
 		Kali, /**< @brief Kayah Li Script */
 		Kana, /**< @brief Katakana Script */
 		Khar, /**< @brief Kharoshthi Script */
 		Khmr, /**< @brief Khmer Script */
 		Knda, /**< @brief Kannada Script */
 		Kthi, /**< @brief Kaithi Script */
 		Lana, /**< @brief Tai Tham Script */
 		Laoo, /**< @brief Lao Script */
 		Latn, /**< @brief Latin Script */
 		Lepc, /**< @brief Lepcha Script */
 		Limb, /**< @brief Limbu Script */
 		Linb, /**< @brief Linear B Script */
 		Lisu, /**< @brief Lisu Script */
 		Lyci, /**< @brief Lycian Script */
 		Lydi, /**< @brief Lydian Script */
 		Mand, /**< @brief Mandaic Script */
 		Merc, /**< @brief Meroitic Cursive Script */
 		Mero, /**< @brief Meroitic Hieroglyphs */
 		Mlym, /**< @brief Malayalam Script */
 		Mong, /**< @brief Mongolian Script */
 		Mtei, /**< @brief Meitei Mayek Script */
 		Mymr, /**< @brief Myanmar Script */
 		Nkoo, /**< @brief N'Ko Script */
 		Ogam, /**< @brief Ogham Script */
 		Olck, /**< @brief Ol Chiki Script */
 		Orkh, /**< @brief Old Turkic Script */
 		Orya, /**< @brief Oriya Script */
 		Osma, /**< @brief Osmanya Script */
 		Phag, /**< @brief Phags-Pa Script */
 		Phli, /**< @brief Inscriptional Pahlavi Script */
 		Phnx, /**< @brief Phoenician Script */
 		Plrd, /**< @brief Miao Script */
 		Prti, /**< @brief Inscriptional Parthian Script */
 		Rjng, /**< @brief Rejang Script */
 		Runr, /**< @brief Runic Script */
 		Samr, /**< @brief Samaritan Script */
 		Sarb, /**< @brief Old South Arabian Script */
 		Saur, /**< @brief Saurashtra Script */
 		Shaw, /**< @brief Shavian Script */
 		Shrd, /**< @brief Sharada Script */
 		Sinh, /**< @brief Sinhala Script */
 		Sora, /**< @brief Sora Sompeng Script */
 		Sund, /**< @brief Sundanese Script */
 		Sylo, /**< @brief Syloti Nagri Script */
 		Syrn, /**< @brief Syriac (Eastern) Script */
 		Tagb, /**< @brief Tagbanwa Script */
 		Takr, /**< @brief Takri Script */
 		Tale, /**< @brief Tai Le Script */
 		Talu, /**< @brief New Tai Lue Script */
 		Taml, /**< @brief Tamil Script */
 		Tavt, /**< @brief Tai Viet Script */
 		Telu, /**< @brief Telugu Script */
 		Tfng, /**< @brief Tifinagh Script */
 		Tglg, /**< @brief Tagalog Script */
 		Thaa, /**< @brief Thaana Script */
 		Thai, /**< @brief Thai Script */
 		Tibt, /**< @brief Tibetan Script */
 		Ugar, /**< @brief Ugaritic Script */
 		Vaii, /**< @brief Vai Script */
 		Xpeo, /**< @brief Old Persian Script */
 		Xsux, /**< @brief Cuneiform Script */
 		Yiii, /**< @brief Yi Script */
 		Zyyy, /**< @brief Common or Inherited Script */
 		Zzzz, /**< @brief Unknown Script */
 	};

 	/** @brief Lookup the Script for a Unicode codepoint.
 	  *
 	  * @param c The Unicode codepoint to lookup.
 	  * @return  The Script of the Unicode codepoint, or @c Zzzz when the
 	  *          codepoint has no script assigned or lies outside the Unicode
 	  *          codepoint range.
 	  */
 	script lookup_script(codepoint_t c);


 	//@}
 	/** @name  ctype-style APIs
 	  * @brief These functions provide wctype compatible functions using the UCD data.
--- a/src/scripts.cpp
+++ b/src/scripts.cpp
--- a/tests/printucddata.cpp
+++ b/tests/printucddata.cpp
@@ -78,18 +78,130 @@ const char *get_category_string(ucd::category c)
 	}
 }

 /** Return the four-letter name of a script enumerator, or "----" when the
   * value does not match any enumerator handled here.
   */
 const char *get_script_string(ucd::script s)
 {
 	using namespace ucd;

 	// Expands to a case label that returns the enumerator's own spelling.
 #define UCD_SCRIPT_NAME(code) case code: return #code;

 	switch (s)
 	{
 	UCD_SCRIPT_NAME(Arab) UCD_SCRIPT_NAME(Armi) UCD_SCRIPT_NAME(Armn) UCD_SCRIPT_NAME(Avst)
 	UCD_SCRIPT_NAME(Bali) UCD_SCRIPT_NAME(Bamu) UCD_SCRIPT_NAME(Batk) UCD_SCRIPT_NAME(Beng)
 	UCD_SCRIPT_NAME(Bopo) UCD_SCRIPT_NAME(Brah) UCD_SCRIPT_NAME(Brai) UCD_SCRIPT_NAME(Bugi)
 	UCD_SCRIPT_NAME(Buhd) UCD_SCRIPT_NAME(Cans) UCD_SCRIPT_NAME(Cari) UCD_SCRIPT_NAME(Cakm)
 	UCD_SCRIPT_NAME(Cham) UCD_SCRIPT_NAME(Cher) UCD_SCRIPT_NAME(Copt) UCD_SCRIPT_NAME(Cprt)
 	UCD_SCRIPT_NAME(Cyrl) UCD_SCRIPT_NAME(Deva) UCD_SCRIPT_NAME(Dsrt) UCD_SCRIPT_NAME(Egyp)
 	UCD_SCRIPT_NAME(Ethi) UCD_SCRIPT_NAME(Geor) UCD_SCRIPT_NAME(Glag) UCD_SCRIPT_NAME(Goth)
 	UCD_SCRIPT_NAME(Grek) UCD_SCRIPT_NAME(Gujr) UCD_SCRIPT_NAME(Guru) UCD_SCRIPT_NAME(Hang)
 	UCD_SCRIPT_NAME(Hano) UCD_SCRIPT_NAME(Hant) UCD_SCRIPT_NAME(Hebr) UCD_SCRIPT_NAME(Hira)
 	UCD_SCRIPT_NAME(Ital) UCD_SCRIPT_NAME(Java) UCD_SCRIPT_NAME(Kali) UCD_SCRIPT_NAME(Kana)
 	UCD_SCRIPT_NAME(Khar) UCD_SCRIPT_NAME(Khmr) UCD_SCRIPT_NAME(Knda) UCD_SCRIPT_NAME(Kthi)
 	UCD_SCRIPT_NAME(Lana) UCD_SCRIPT_NAME(Laoo) UCD_SCRIPT_NAME(Latn) UCD_SCRIPT_NAME(Lepc)
 	UCD_SCRIPT_NAME(Limb) UCD_SCRIPT_NAME(Linb) UCD_SCRIPT_NAME(Lisu) UCD_SCRIPT_NAME(Lyci)
 	UCD_SCRIPT_NAME(Lydi) UCD_SCRIPT_NAME(Mand) UCD_SCRIPT_NAME(Merc) UCD_SCRIPT_NAME(Mero)
 	UCD_SCRIPT_NAME(Mlym) UCD_SCRIPT_NAME(Mong) UCD_SCRIPT_NAME(Mtei) UCD_SCRIPT_NAME(Mymr)
 	UCD_SCRIPT_NAME(Nkoo) UCD_SCRIPT_NAME(Ogam) UCD_SCRIPT_NAME(Olck) UCD_SCRIPT_NAME(Orkh)
 	UCD_SCRIPT_NAME(Orya) UCD_SCRIPT_NAME(Osma) UCD_SCRIPT_NAME(Phag) UCD_SCRIPT_NAME(Phli)
 	UCD_SCRIPT_NAME(Phnx) UCD_SCRIPT_NAME(Plrd) UCD_SCRIPT_NAME(Prti) UCD_SCRIPT_NAME(Rjng)
 	UCD_SCRIPT_NAME(Runr) UCD_SCRIPT_NAME(Samr) UCD_SCRIPT_NAME(Sarb) UCD_SCRIPT_NAME(Saur)
 	UCD_SCRIPT_NAME(Shaw) UCD_SCRIPT_NAME(Shrd) UCD_SCRIPT_NAME(Sinh) UCD_SCRIPT_NAME(Sora)
 	UCD_SCRIPT_NAME(Sund) UCD_SCRIPT_NAME(Sylo) UCD_SCRIPT_NAME(Syrn) UCD_SCRIPT_NAME(Tagb)
 	UCD_SCRIPT_NAME(Takr) UCD_SCRIPT_NAME(Tale) UCD_SCRIPT_NAME(Talu) UCD_SCRIPT_NAME(Taml)
 	UCD_SCRIPT_NAME(Tavt) UCD_SCRIPT_NAME(Telu) UCD_SCRIPT_NAME(Tfng) UCD_SCRIPT_NAME(Tglg)
 	UCD_SCRIPT_NAME(Thaa) UCD_SCRIPT_NAME(Thai) UCD_SCRIPT_NAME(Tibt) UCD_SCRIPT_NAME(Ugar)
 	UCD_SCRIPT_NAME(Vaii) UCD_SCRIPT_NAME(Xpeo) UCD_SCRIPT_NAME(Xsux) UCD_SCRIPT_NAME(Yiii)
 	UCD_SCRIPT_NAME(Zyyy) UCD_SCRIPT_NAME(Zzzz)
 	default:
 		return "----";
 	}

 #undef UCD_SCRIPT_NAME
 }

 int main()
 {
 	for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
 	{
 		const char *script = get_script_string(ucd::lookup_script(c));
 		const char *category = get_category_string(ucd::lookup_category(c));
 		const char *category_group = get_category_group_string(ucd::lookup_category_group(c));
 		ucd::codepoint_t upper = ucd::toupper(c);
 		ucd::codepoint_t lower = ucd::tolower(c);
 		ucd::codepoint_t title = ucd::totitle(c);
 		const char *whitespace = ucd::isspace(c) ? "White_Space" : "";
 		printf("%06X %s %s %06X %06X %06X %s\n",
 		       c, category_group, category,
 		printf("%06X %s %s %s %06X %06X %06X %s\n",
 		       c, script, category_group, category,
 		       upper, lower, title,
 		       whitespace);
 	}
--- a/tools/iana.py
+++ b/tools/iana.py
@@ -0,0 +1,102 @@
 #!/usr/bin/python

 # Copyright (C) 2012 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
 # ucd-tools is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # ucd-tools is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

 import os

 def read_data(path, split_char=':'):
 	"""Yield the fields of every non-comment line of the file at ``path``.

 	Each line has its newline removed and is then split on ``split_char``.
 	Lines that begin with '#' are skipped.
 	"""
 	with open(path) as source:
 		for raw in source:
 			text = raw.rstrip('\n')
 			if text.startswith('#'):
 				continue
 			yield text.split(split_char)

 def fold_lines(path):
 	"""Yield the logical lines of the file at ``path`` with folded lines joined.

 	A physical line that begins with a space continues the previous line: its
 	first character is dropped and the remainder is appended to that line.
 	Empty logical lines are not yielded.
 	"""
 	pending = None
 	with open(path) as f:
 		for line in f:
 			line = line.rstrip('\n')
 			if line.startswith(' '):
 				# A continuation with nothing before it starts a fresh line
 				# instead of being appended to the text "None".
 				pending = '%s%s' % (pending or '', line[1:])
 				continue
 			if pending:
 				yield pending
 			pending = line
 	# The last logical line has no following line to trigger its yield, so
 	# emit it here; otherwise the final field of the file is lost.
 	if pending:
 		yield pending

 def iana_subtag_entries(path):
 	"""Yield one dict of field name to value for each record in the registry.

 	Records are separated by '%%' lines and each field is a 'Key: value' line
 	(after unfolding with fold_lines). Only records that contain a 'Type'
 	field are yielded. When a record repeats 'Description', the first value
 	is kept; any other repeated key keeps its last value.
 	"""
 	tag = {}
 	for line in fold_lines(path):
 		if line == '%%':
 			if 'Type' in tag:
 				yield tag
 			tag = {}
 			continue

 		# Split on the first ': ' only, so values may themselves contain ': '.
 		key, _, value = line.partition(': ')

 		if key == 'Description':
 			# Only select the first Description. This handles subtag codes
 			# that have multiple descriptions (e.g. 'es' maps to "Spanish"
 			# and "Castilian").
 			if key not in tag:
 				tag[key] = value
 		else:
 			tag[key] = value
 	# The final record is not followed by '%%'. Apply the same 'Type' filter
 	# as above so an empty file or a trailing separator does not produce a
 	# record that callers cannot process.
 	if 'Type' in tag:
 		yield tag

 # Maps the 'Type' field of a registry record to the name that
 # read_iana_subtags stores under 'Type' in the dicts it returns.
 typemap = {
 	'extlang':       'ExtLang',
 	'grandfathered': 'Grandfathered',
 	'language':      'Language',
 	'redundant':     'Redundant',
 	'region':        'Region',
 	'script':        'Script',
 	'variant':       'Variant',
 }

 # Maps the 'Scope' field of a language record to the name that
 # read_iana_subtags stores under 'Type' in place of the typemap value.
 scopemap = {
 	'collection':    'Collection',
 	'macrolanguage': 'MacroLanguage',
 	'special':       'Special',
 	'private-use':   'PrivateUse',
 }

 def read_iana_subtags(path):
 	"""Read the registry at ``path`` into a dict keyed by subtag (or tag).

 	Each value is the record's field dict with the 'Subtag'/'Tag' and 'Scope'
 	fields removed and 'Type' replaced by its scopemap name (when the record
 	has a 'Scope') or its typemap name. Records whose key contains '..' are
 	left out. Raises Exception when a record with a 'Scope' is not of type
 	'language'.
 	"""
 	result = {}
 	for entry in iana_subtag_entries(path):
 		id_field = 'Subtag' if 'Subtag' in entry else 'Tag'
 		ref = entry.pop(id_field)

 		if 'Scope' not in entry:
 			entry['Type'] = typemap[entry['Type']]
 		else:
 			kind = entry['Type']
 			if kind != 'language':
 				raise Exception('"Scope" property unexpected for Type="%s"' % kind)
 			scope = entry['Scope']
 			entry['Type'] = scopemap[scope]
 			del entry['Scope']

 		if '..' in ref: # exclude private use definitions
 			continue
 		result[ref] = entry
 	return result
--- a/tools/printdata.py
+++ b/tools/printdata.py
@@ -32,21 +32,29 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
 	if data['Property'] in ['White_Space']:
 		for codepoint in data['Range']:
 			unicode_chars[codepoint]['Properties'].append(data['Property'])
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint]['Script'] = data['Script']

 null = ucd.CodePoint('0000')
 # Print one line per codepoint: codepoint, script, general category group,
 # general category, upper/lower/title case mappings and the property list.
 if __name__ == '__main__':
 	for codepoint in ucd.CodeRange('000000..10FFFF'):
 		try:
 			data = unicode_chars[codepoint]
 		except KeyError:
 			# No data for this codepoint: report it as unassigned (Cn) with
 			# identity case mappings and no properties.
 			data = {'GeneralCategory': 'Cn', 'TitleCase': codepoint, 'UpperCase': codepoint, 'LowerCase': codepoint, 'Properties': []}
 		try:
 			script = data['Script']
 		except KeyError:
 			script = 'Zzzz'
 		title = data['TitleCase']
 		upper = data['UpperCase']
 		lower = data['LowerCase']
 		# A mapping equal to codepoint 0000 is replaced by the codepoint
 		# itself (presumably 0000 marks "no mapping" in the parsed data).
 		if title == null: title = codepoint
 		if upper == null: upper = codepoint
 		if lower == null: lower = codepoint
 		# Emit exactly one line per codepoint, including the Script column.
 		print('%s %s %s %s %s %s %s %s' % (
 		      codepoint, script,
 		      data['GeneralCategory'][0], data['GeneralCategory'],
 		      upper, lower, title,
 		      ' '.join(data['Properties'])))
--- a/tools/scripts.py
+++ b/tools/scripts.py
@@ -0,0 +1,172 @@
 #!/usr/bin/python

 # Copyright (C) 2012 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
 # ucd-tools is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # ucd-tools is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

 import os
 import sys
 import ucd

 ucd_rootdir = sys.argv[1]
 ucd_version = sys.argv[2]

 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint] = data['Script']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
 # need to be generated.
 script_sets = [
 	(ucd.CodeRange('000000..00D7FF'), None,   'Multiple Blocks'),
 	(ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
 	(ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
 	(ucd.CodeRange('00F900..02FAFF'), None,   'Multiple Blocks'),
 	(ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
 	(ucd.CodeRange('0E0000..0E01FF'), None,   'Multiple Blocks'),
 	(ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
 	(ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
 	(ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
 	(ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
 	(ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
 ]

 # These scripts have many pages consisting of just this script:
 special_scripts = []

 # For every range in script_sets that has no single script, build a table
 # keyed by the first codepoint of each 256-codepoint page. A page's value is
 # either a script name (the whole page is that script) or a list with the
 # script name of each codepoint in the page. The tables are stored in
 # script_tables under the key '<first>_<last>' of the range.
 script_tables = {}
 for codepoints, script, comment in script_sets:
 	if not script:
 		table = {}
 		table_entry = None      # script names collected for the current page
 		table_codepoint = None  # first codepoint of the current page
 		table_script = None     # script shared by the page so far, or None once mixed
 		for i, codepoint in enumerate(codepoints):
 			try:
 				script = unicode_chars[codepoint]
 			except KeyError:
 				script = 'Zzzz' # Unknown
 			if (i % 256) == 0:
 				# Page boundary: store the page that has just been completed.
 				if table_entry:
 					if table_script in special_scripts:
 						table[table_codepoint] = table_script
 					elif table_script:
 						# First page found to consist of this script only.
 						special_scripts.append(table_script)
 						table[table_codepoint] = table_script
 					else:
 						table[table_codepoint] = table_entry
 				table_entry = []
 				table_codepoint = codepoint
 				table_script = script
 			if script != table_script:
 				table_script = None
 			table_entry.append(script)
 		# Store the last page. Unlike the pages above, it is stored by name
 		# only when its script is already in special_scripts.
 		if table_entry:
 			if table_script in special_scripts:
 				table[table_codepoint] = table_script
 			else:
 				table[table_codepoint] = table_entry
 		script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table

 # Write the generated C++ source to standard output.
 if __name__ == '__main__':
 	# Fixed file header: licence, generation note, includes and UCD version.
 	sys.stdout.write("""/* Unicode Scripts
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */

 // NOTE: This file is automatically generated from the Scripts.txt file in
 // the Unicode Character database by the ucd-tools/tools/scripts.py script.

 #include "ucd/ucd.h"

 #include <stddef.h>

 using namespace ucd;

 // Unicode Character Data %s
 """ % ucd_version)

 	# One shared 256-entry table per script in special_scripts; every entry
 	# is that script.
 	for script in special_scripts:
 		sys.stdout.write('\n')
 		sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
 		sys.stdout.write('{')
 		for i in range(0, 256):
 			if (i % 16) == 0:
 				sys.stdout.write('\n\t/* %02X */' % i)
 			sys.stdout.write(' %s,' % script)
 		sys.stdout.write('\n};\n')

 	# One table per page stored as a list of scripts, named after the page's
 	# first codepoint. Pages stored as a script name use the shared tables
 	# written above and are skipped here.
 	for codepoints, script, comment in script_sets:
 		if not script:
 			tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
 			for codepoint in sorted(tables.keys()):
 				table = tables[codepoint]
 				if table in special_scripts:
 					continue

 				sys.stdout.write('\n')
 				sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
 				sys.stdout.write('{')
 				for i, script in enumerate(table):
 					if (i % 16) == 0:
 						sys.stdout.write('\n\t/* %02X */' % i)
 					sys.stdout.write(' %s,' % script)
 				sys.stdout.write('\n};\n')

 	# For each range without a single script, an array of pointers to its
 	# page tables in sorted codepoint order.
 	for codepoints, script, comment in script_sets:
 		if not script:
 			table_index = '%s_%s' % (codepoints.first, codepoints.last)
 			sys.stdout.write('\n')
 			sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
 			sys.stdout.write('{\n')
 			for codepoint, table in sorted(script_tables[table_index].items()):
 				if isinstance(table, str):
 					sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
 				else:
 					sys.stdout.write('\tscripts_%s,\n' % codepoint)
 			sys.stdout.write('};\n')

 	# The lookup function: one upper-bound test per script_sets entry, in
 	# order. Single-script ranges return the script directly; the others
 	# select the page table by (c - first) / 256 and index it with c % 256.
 	sys.stdout.write('\n')
 	sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
 	sys.stdout.write('{\n')
 	for codepoints, script, comment in script_sets:
 		if script:
 			sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
 		else:
 			sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
 			sys.stdout.write('\t{\n')
 			sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
 			sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
 			sys.stdout.write('\t}\n')
 	sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
 	sys.stdout.write('}\n')
--- a/tools/ucd.py
+++ b/tools/ucd.py
@@ -19,6 +19,30 @@

 import os
 import sys
 import iana

 # Maps UCD script names (as they appear in Scripts.txt) to four-letter script
 # codes. It starts with hand-written entries and is then extended from the
 # IANA registry, which is read when this module is imported.
 script_map = {
 	# UCD script names not derivable from IANA script tags:
 	'Canadian_Aboriginal': 'Cans',
 	'Common': 'Zyyy',
 	'Egyptian_Hieroglyphs': 'Egyp',
 	'Inherited': 'Zyyy',
 	'Meetei_Mayek': 'Mtei',
 	'Nko': 'Nkoo',
 	'Phags_Pa': 'Phag',
 	# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
 	'Cuneiform': 'Xsux',
 }
 # The registry path is relative, so it is resolved against the current
 # working directory of the process that imports this module.
 for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
 	if tag['Type'] == 'Script':
 		# Convert the IANA script tag descriptions to the UCD script names:
 		desc = tag['Description']
 		if ' (' in desc:
 			# Drop a parenthesised qualifier from the description.
 			desc = desc.split(' (')[0]
 		desc = desc.replace(' ', '_')
 		# NOTE(review): when two descriptions reduce to the same name, the
 		# one iterated last wins, so the result depends on dict order.
 		script_map[desc] = ref
 # Fix up incorrectly mapped script names:
 script_map['Cyrillic'] = 'Cyrl'

 class CodePoint:
 	def __init__(self, x):
@@ -86,6 +110,9 @@ def boolean(x):
 		return True
 	return False

 def script(x):
 	"""Translate the script name ``x`` into its script code via script_map.

 	Raises KeyError when the name has no entry in script_map.
 	"""
 	code = script_map[x]
 	return code

 data_items = {
 	'Blocks': [
 		('Range', codepoint),
@@ -101,7 +128,7 @@ data_items = {
 	],
 	'Scripts': [
 		('Range', codepoint),
 		('Script', str),
 		('Script', script),
 	],
 	'UnicodeData': [
 		('CodePoint', codepoint),