The mapping of the script labels in the UCD data to ISO 15924 script tags is now done using the sc property map in the PropertyValueAliases data. This has the following benefits: 1. It removes the dependency on the IANA subtag registry. 2. It ensures the scripts are correct as specified in the UCD data files.

11 years ago · 1154409393
--- a/.gitignore
+++ b/.gitignore
 # build output:
 data/language-subtag-registry
 data/ucd
 src/libucd.la
--- a/Makefile.am
+++ b/Makefile.am
 # Unicode Character Database version used in the download URLs below.
 # NOTE(review): the @UCD_VERSION@ placeholder is presumably substituted by
 # configure -- confirm against configure.ac.
 UCD_VERSION=@UCD_VERSION@
 # Local directory that holds the downloaded UCD data files.
 UCD_ROOTDIR=data/ucd
 # Fetch the IANA language subtag registry into the target file ($@).
 data/language-subtag-registry:
 	wget -O $@ http://www.iana.org/assignments/language-subtag-registry
 # Fetch individual UCD data files for ${UCD_VERSION} from unicode.org,
 # creating data/ucd first if it does not exist yet.
 data/ucd/PropList.txt:
 	mkdir -pv data/ucd
 	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt
 data/ucd/PropertyValueAliases.txt:
 	mkdir -pv data/ucd
 	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropertyValueAliases.txt
 data/ucd/Scripts.txt:
 	mkdir -pv data/ucd
 	wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt
 ############################# libucd ##########################################
 # Data files listed as prerequisites of tools/ucd.py (no recipe; these rules
 # only add dependencies).
 tools/ucd.py: data/language-subtag-registry
 tools/ucd.py: data/ucd/PropertyValueAliases.txt
 # tools/case.py depends on tools/ucd.py and on the UnicodeData.txt data file.
 tools/case.py: tools/ucd.py \
 	data/ucd/UnicodeData.txt
--- a/src/scripts.cpp
+++ b/src/scripts.cpp
--- a/tools/ucd.py
+++ b/tools/ucd.py
 import sys
 import iana
 # Table from UCD script names to four-letter script codes. It is seeded with
 # hand-written entries and then extended from the IANA subtag registry below.
 script_map = {
 	# UCD script names not derivable from IANA script tags:
 	'Canadian_Aboriginal': 'Cans',
 	'Common': 'Zyyy',
 	'Egyptian_Hieroglyphs': 'Egyp',
 	# NOTE(review): ISO 15924 has a separate code (Zinh) for inherited
 	# script; here Inherited shares Zyyy with Common -- confirm intent.
 	'Inherited': 'Zyyy',
 	'Meetei_Mayek': 'Mtei',
 	'Nko': 'Nkoo',
 	'Phags_Pa': 'Phag',
 	# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
 	'Cuneiform': 'Xsux',
 	'Duployan': 'Dupl',
 }
 # Add every registry entry whose Type is 'Script', keyed by a name derived
 # from its Description. Entries added here overwrite seed entries that have
 # the same derived name.
 # NOTE(review): read_iana_subtags presumably returns a dict of
 # subtag -> record fields; verify against the iana module.
 for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
 	if tag['Type'] == 'Script':
 		# Convert the IANA script tag descriptions to the UCD script names:
 		desc = tag['Description']
 		# Drop a trailing parenthesised qualifier, keeping the text before ' ('.
 		if ' (' in desc:
 			desc = desc.split(' (')[0]
 		# UCD names use underscores where the descriptions use spaces.
 		desc = desc.replace(' ', '_')
 		script_map[desc] = ref
 # Fix up incorrectly mapped script names:
 script_map['Cyrillic'] = 'Cyrl'
 # NOTE(review): this rebinding discards everything built above; script_map is
 # assigned again at the bottom of the module from the 'sc' entries of
 # PropertyValueAliases. The table construction above looks like leftover
 # code from the IANA-based approach -- confirm and remove if so.
 script_map = {}
 class CodePoint:
 	def __init__(self, x):
 					data[key], linedata = typemap(linedata)
 				yield data
 def parse_property_mapping(ucd_rootdir, propname, reverse=False):
 	"""Build a lookup table for one property from PropertyValueAliases.

 	Only entries whose 'Property' field equals propname are used.

 	ucd_rootdir -- directory passed through to parse_ucd_data.
 	propname    -- property name to select (e.g. 'sc').
 	reverse     -- when false (the default) the table maps each entry's
 	               'Key' field to its 'Value' field; when true it maps
 	               'Value' to 'Key'.

 	Returns a dict. If the same source field occurs more than once, the
 	last matching entry wins.
 	"""
 	# Decide once which field indexes the table and which field it stores.
 	if reverse:
 		index_field, stored_field = 'Value', 'Key'
 	else:
 		index_field, stored_field = 'Key', 'Value'
 	mapping = {}
 	for record in parse_ucd_data(ucd_rootdir, 'PropertyValueAliases'):
 		if record['Property'] != propname:
 			continue
 		mapping[record[index_field]] = record[stored_field]
 	return mapping
 if __name__ == '__main__':
 	try:
 		items = sys.argv[3].split(',')
 	except:
 		items = None
 	script_map = parse_property_mapping(sys.argv[1], 'sc', reverse=True)
 	for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
 		if items:
 			print ','.join([str(entry[item]) for item in items])
 		else:
 			print entry
 else:
 	script_map = parse_property_mapping('data/ucd', 'sc', reverse=True)