@@ -92,9 +92,9 @@ tools/scripts.py: tools/ucd.py \
 	data/ucd/Scripts.txt

 ucd-update: tools/case.py tools/categories.py tools/scripts.py
-	tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/case.cpp
-	tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/categories.cpp
-	tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/scripts.cpp
+	tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/case.cpp
+	tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/categories.cpp
+	tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/scripts.cpp

 libucd_includedir = $(includedir)/ucd
 libucd_include_HEADERS = \

@@ -120,7 +120,7 @@ tests/unicode-data.expected: tools/printdata.py tools/ucd.py \
 	data/ucd/UnicodeData.txt \
 	data/ucd/PropList.txt \
 	data/ucd/Scripts.txt
-	tools/printdata.py ${UCD_ROOTDIR} > $@
+	tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@

 tests/unicode-data.actual: tests/printucddata
 	tests/printucddata > $@
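The Makefile change above threads `${UCD_FLAGS}` through to every generator invocation, after the existing positional arguments. As a rough sketch of how a generator script can pick the flags up (the positional layout follows the rules above; the `--with-csur` flag itself appears in the tool hunks further down):

```python
import sys

# Sketch only: ${UCD_FLAGS} is appended after the positional arguments,
# so optional flags can be looked for anywhere in sys.argv without
# disturbing the root-directory and version arguments.
ucd_rootdir = sys.argv[1]   # e.g. data/ucd
ucd_version = sys.argv[2]   # e.g. 7.0.0 (printdata.py is run without it)
with_csur = '--with-csur' in sys.argv

sys.stdout.write('/* generated from UCD %s%s */\n'
                 % (ucd_version, ' + CSUR' if with_csur else ''))
```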
@@ -24,6 +24,18 @@ In addition to this it provides APIs for:
 The following data sets are used for the data tables:
 - [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/).

+## ConScript Unicode Registry
+
+If enabled, the following data from the
+[ConScript Unicode Registry](http://www.evertype.com/standards/csur/) (CSUR) is
+added:
+
+| Code Range | Script                                                          |
+|------------|-----------------------------------------------------------------|
+| F8D0-F8FF  | [Klingon](http://www.evertype.com/standards/csur/klingon.html)  |
+
+This data is located in the `data/csur` directory.
+
 ## Build Dependencies

 In order to build ucd-tools, you need:

@@ -84,6 +96,13 @@ unicode is released, you need to run:

 where `VERSION` is the Unicode version (e.g. `6.3.0`).

+Additionally, you can use the `UCD_FLAGS` option to control how the data is
+generated. The following flags are supported:
+
+| Flag        | Description                           |
+|-------------|---------------------------------------|
+| --with-csur | Add ConScript Unicode Registry data.  |
+
 ## Bugs

 Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues)
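As a quick illustration of the table above: the `F8D0-F8FF` range sits in the Basic Multilingual Plane's Private Use Area and covers 48 codepoints, which are the entries the `--with-csur` flag populates. A throwaway check:

```python
# Expand the README's F8D0-F8FF code range to see what --with-csur covers.
klingon_range = ['%04X' % cp for cp in range(0xF8D0, 0xF8FF + 1)]
print(len(klingon_range), klingon_range[0], klingon_range[-1])  # 48 F8D0 F8FF
```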
@@ -28,6 +28,11 @@ unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	for codepoint in data['CodePoint']:
 		unicode_chars[codepoint] = data['GeneralCategory']
+if '--with-csur' in sys.argv:
+	for csur in ['Klingon']:
+		for data in ucd.parse_ucd_data('data/csur', csur):
+			for codepoint in data['CodePoint']:
+				unicode_chars[codepoint] = data['GeneralCategory']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
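The hunk above (the general-category generator, `tools/categories.py` by the look of the `GeneralCategory` handling) runs the CSUR pass after the UnicodeData pass, so whatever category the CSUR file supplies for a Private Use Area codepoint replaces the one already recorded. A self-contained sketch of that override behaviour, using placeholder values rather than the real file contents:

```python
# Toy records standing in for the parsed UnicodeData and CSUR data.
unicode_chars = {0xF8D0: 'Co'}  # UnicodeData assigns Private Use to the PUA
csur_records = [{'CodePoint': [0xF8D0], 'GeneralCategory': 'Lo'}]  # assumed value

# Same merge shape as the hunk above: later assignments win.
for data in csur_records:
    for codepoint in data['CodePoint']:
        unicode_chars[codepoint] = data['GeneralCategory']

print(unicode_chars[0xF8D0])  # Lo -- the CSUR category replaces 'Co'
```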
@@ -22,6 +22,7 @@ import sys
 import ucd

 ucd_rootdir = sys.argv[1]
+csur_rootdir = 'data/csur'

 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):

@@ -35,6 +36,15 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint]['Script'] = data['Script']
+if '--with-csur' in sys.argv:
+	for csur in ['Klingon']:
+		for data in ucd.parse_ucd_data(csur_rootdir, csur):
+			for codepoint in data['CodePoint']:
+				if not 'TitleCase' in data: data['TitleCase'] = codepoint
+				if not 'UpperCase' in data: data['UpperCase'] = codepoint
+				if not 'LowerCase' in data: data['LowerCase'] = codepoint
+				if not 'Properties' in data: data['Properties'] = []
+				unicode_chars[codepoint] = data

 null = ucd.CodePoint('0000')
 if __name__ == '__main__':
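In the case-mapping generator (`tools/case.py`, judging by the `TitleCase`/`UpperCase`/`LowerCase` fields), CSUR records that omit a case mapping fall back to the codepoint itself, and missing properties fall back to an empty list. The same defaulting written as a standalone helper, purely as a sketch of the logic rather than the script's actual structure:

```python
def with_case_defaults(record, codepoint):
    """Copy a CSUR record, defaulting absent case fields to identity mappings."""
    data = dict(record)
    data.setdefault('TitleCase', codepoint)   # no title-case form: map to itself
    data.setdefault('UpperCase', codepoint)
    data.setdefault('LowerCase', codepoint)
    data.setdefault('Properties', [])         # no extra properties by default
    return data

# Hypothetical record: the 'Script' value is a placeholder, not the file's contents.
print(with_case_defaults({'CodePoint': [0xF8D0], 'Script': 'Klingon'}, 0xF8D0))
```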
@@ -28,6 +28,11 @@ unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
 	for codepoint in data['Range']:
 		unicode_chars[codepoint] = data['Script']
+if '--with-csur' in sys.argv:
+	for csur in ['Klingon']:
+		for data in ucd.parse_ucd_data('data/csur', csur):
+			for codepoint in data['CodePoint']:
+				unicode_chars[codepoint] = data['Script']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
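The script generator (`tools/scripts.py`, given the `Script` field) follows the same flag-gated pattern as the categories hunk. A compact sketch of the gate, again with toy data:

```python
import sys

def merge_csur_scripts(unicode_chars, csur_records):
    # Mirrors the hunk above: the merge only happens when --with-csur is given.
    if '--with-csur' not in sys.argv:
        return
    for data in csur_records:
        for codepoint in data['CodePoint']:
            unicode_chars[codepoint] = data['Script']

chars = {}
# Placeholder record; the real values come from the CSUR data file.
merge_csur_scripts(chars, [{'CodePoint': [0xF8D0], 'Script': 'Klingon'}])
print(chars)  # stays empty unless the script was invoked with --with-csur
```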
@@ -98,6 +98,7 @@ def strlist(x):
 	return x, []

 data_items = {
+	# Unicode Character Data:
 	'Blocks': [
 		('Range', codepoint),
 		('Name', string)

@@ -137,7 +138,7 @@ data_items = {
 		('LowerCase', codepoint),
 		('TitleCase', codepoint),
 	],
-	# Supplemental Data:
+	# ConScript Unicode Registry Data:
 	'Klingon': [
 		('CodePoint', codepoint),
 		('Script', string),
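The `data_items` table in `tools/ucd.py` maps each data file name to an ordered list of `(field, parser)` pairs, so adding the `Klingon` entry is enough to describe the new CSUR file's columns. A minimal, self-contained illustration of how such a spec can drive parsing of a semicolon-separated line (this is not ucd.py's actual parser, and the sample line is invented):

```python
# Stand-ins for the codepoint/string parsers referenced by data_items.
def codepoint(value):
    return int(value, 16)

def string(value):
    return value

klingon_spec = [('CodePoint', codepoint), ('Script', string)]

def parse_line(line, spec):
    # Pair each semicolon-separated field with its (name, parser) entry.
    fields = [field.strip() for field in line.split(';')]
    return {name: parse(value) for (name, parse), value in zip(spec, fields)}

print(parse_line('F8D0;Klingon', klingon_spec))  # {'CodePoint': 63696, 'Script': 'Klingon'}
```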