10 years ago · bcf8be59b3
--- a/Makefile.am
+++ b/Makefile.am
@@ -92,9 +92,9 @@ tools/scripts.py: tools/ucd.py \
 	data/ucd/Scripts.txt

 # Regenerate the C++ data tables in src/ from the Unicode Character Database.
 # Each generator is run exactly once and receives ${UCD_FLAGS}, so optional
 # switches such as --with-csur are applied to every generated table.
 ucd-update: tools/case.py tools/categories.py tools/scripts.py
 	tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/case.cpp
 	tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/categories.cpp
 	tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/scripts.cpp

 libucd_includedir = $(includedir)/ucd
 libucd_include_HEADERS = \
@@ -120,7 +120,7 @@ tests/unicode-data.expected: tools/printdata.py tools/ucd.py \
 	data/ucd/UnicodeData.txt \
 	data/ucd/PropList.txt \
 	data/ucd/Scripts.txt
 	tools/printdata.py ${UCD_ROOTDIR} > $@
 	tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@

 # Run the tests/printucddata program (this rule's prerequisite) and capture
 # its standard output as the target file.
 tests/unicode-data.actual: tests/printucddata
 	tests/printucddata > $@
--- a/README.md
+++ b/README.md
@@ -24,6 +24,18 @@ In addition to this it provides APIs for:
 The following data sets are used for the data tables:
 -  [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/).

 ## ConScript Unicode Registry

 If enabled (by passing `--with-csur` in `UCD_FLAGS` when regenerating the data
 tables), the following data from the
 [ConScript Unicode Registry](http://www.evertype.com/standards/csur/) (CSUR) is
 added:

 | Code Range | Script  |
 |------------|---------|
 | F8D0-F8FF  | [Klingon](http://www.evertype.com/standards/csur/klingon.html) |

 This data is located in the `data/csur` directory.

 ## Build Dependencies

 In order to build ucd-tools, you need:
@@ -84,6 +96,13 @@ unicode is released, you need to run:

 where `VERSION` is the Unicode version (e.g. `6.3.0`).

 Additionally, you can use the `UCD_FLAGS` option to control how the data is
 generated. The following flags are supported:

 | Flag        | Description |
 |-------------|-------------|
 | --with-csur | Add ConScript Unicode Registry data. |

 ## Bugs

 Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues)
--- a/tools/categories.py
+++ b/tools/categories.py
@@ -28,6 +28,11 @@ unicode_chars = {}
 # Map every code point listed in UnicodeData to its general category.
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	for codepoint in data['CodePoint']:
 		unicode_chars[codepoint] = data['GeneralCategory']
 # Optional ConScript Unicode Registry overlay, enabled by passing --with-csur
 # anywhere on the command line. It runs after the UnicodeData pass, so a CSUR
 # entry replaces any category already stored for the same code point.
 # NOTE(review): 'data/csur' is a relative path, so this presumably expects to
 # be run from the project root -- confirm.
 if '--with-csur' in sys.argv:
 	for csur in ['Klingon']:
 		for data in ucd.parse_ucd_data('data/csur', csur):
 			for codepoint in data['CodePoint']:
 				unicode_chars[codepoint] = data['GeneralCategory']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
--- a/tools/printdata.py
+++ b/tools/printdata.py
@@ -22,6 +22,7 @@ import sys
 import ucd

 ucd_rootdir = sys.argv[1]
 csur_rootdir = 'data/csur'

 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
@@ -35,6 +36,15 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
 # Attach the script name to the record already stored for each code point in
 # the range. The record is indexed, not created, so a code point without an
 # existing entry in unicode_chars raises KeyError here.
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint]['Script'] = data['Script']
 # Optional ConScript Unicode Registry overlay, enabled by passing --with-csur
 # anywhere on the command line. It runs after the UCD passes above, so a CSUR
 # record replaces whatever was stored for the same code point.
 if '--with-csur' in sys.argv:
 	for csur in ['Klingon']:
 		for data in ucd.parse_ucd_data(csur_rootdir, csur):
 			for codepoint in data['CodePoint']:
 				# Give each code point its own copy of the record. Filling the
 				# defaults into the shared `data` dict would make every code
 				# point of a multi-code-point record inherit the case mappings
 				# of the first one.
 				entry = dict(data)
 				# Fields missing from the CSUR data default to "maps to itself"
 				# for the case mappings and to an empty property list.
 				entry.setdefault('TitleCase', codepoint)
 				entry.setdefault('UpperCase', codepoint)
 				entry.setdefault('LowerCase', codepoint)
 				entry.setdefault('Properties', [])
 				unicode_chars[codepoint] = entry

 null = ucd.CodePoint('0000')
 if __name__ == '__main__':
--- a/tools/scripts.py
+++ b/tools/scripts.py
@@ -28,6 +28,11 @@ unicode_chars = {}
 # Map every code point in each Scripts range to its script name.
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint] = data['Script']
 # Optional ConScript Unicode Registry overlay, enabled by passing --with-csur
 # anywhere on the command line. CSUR records are keyed by 'CodePoint' rather
 # than 'Range', and are applied last so they replace any script already stored
 # for the same code point.
 # NOTE(review): 'data/csur' is a relative path, so this presumably expects to
 # be run from the project root -- confirm.
 if '--with-csur' in sys.argv:
 	for csur in ['Klingon']:
 		for data in ucd.parse_ucd_data('data/csur', csur):
 			for codepoint in data['CodePoint']:
 				unicode_chars[codepoint] = data['Script']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
--- a/tools/ucd.py
+++ b/tools/ucd.py
@@ -98,6 +98,7 @@ def strlist(x):
 	return x, []

 data_items = {
 	# Unicode Character Data:
 	'Blocks': [
 		('Range', codepoint),
 		('Name', string)
@@ -137,7 +138,7 @@ data_items = {
 		('LowerCase', codepoint),
 		('TitleCase', codepoint),
 	],
 	# Supplemental Data:
 	# ConScript Unicode Registry Data:
 	'Klingon': [
 		('CodePoint', codepoint),
 		('Script', string),