data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
ucd-update: tools/case.py tools/categories.py tools/scripts.py | ucd-update: tools/case.py tools/categories.py tools/scripts.py | ||||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/case.cpp | |||||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/categories.cpp | |||||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/scripts.cpp | |||||
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/case.cpp | |||||
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/categories.cpp | |||||
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/scripts.cpp | |||||
libucd_includedir = $(includedir)/ucd | libucd_includedir = $(includedir)/ucd | ||||
libucd_include_HEADERS = \ | libucd_include_HEADERS = \ | ||||
data/ucd/UnicodeData.txt \ | data/ucd/UnicodeData.txt \ | ||||
data/ucd/PropList.txt \ | data/ucd/PropList.txt \ | ||||
data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
tools/printdata.py ${UCD_ROOTDIR} > $@ | |||||
tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@ | |||||
tests/unicode-data.actual: tests/printucddata | tests/unicode-data.actual: tests/printucddata | ||||
tests/printucddata > $@ | tests/printucddata > $@ |
The following data sets are used for the data tables: | The following data sets are used for the data tables: | ||||
- [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/). | - [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/). | ||||
## ConScript Unicode Registry | |||||
If enabled, the following data from the | |||||
[ConScript Unicode Registry](http://www.evertype.com/standards/csur/) (CSUR) is | |||||
added: | |||||
| Code Range | Script | | |||||
|------------|---------| | |||||
| F8D0-F8FF | [Klingon](http://www.evertype.com/standards/csur/klingon.html) | | |||||
This data is located in the `data/csur` directory. | |||||
## Build Dependencies | ## Build Dependencies | ||||
In order to build ucd-tools, you need: | In order to build ucd-tools, you need: | ||||
where `VERSION` is the Unicode version (e.g. `6.3.0`). | where `VERSION` is the Unicode version (e.g. `6.3.0`). | ||||
Additionally, you can use the `UCD_FLAGS` option to control how the data is | |||||
generated. The following flags are supported: | |||||
| Flag | Description | | |||||
|-------------|-------------| | |||||
| --with-csur | Add ConScript Unicode Registry data. | | |||||
## Bugs | ## Bugs | ||||
Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues) | Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues) |
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
for codepoint in data['CodePoint']: | for codepoint in data['CodePoint']: | ||||
unicode_chars[codepoint] = data['GeneralCategory'] | unicode_chars[codepoint] = data['GeneralCategory'] | ||||
if '--with-csur' in sys.argv: | |||||
for csur in ['Klingon']: | |||||
for data in ucd.parse_ucd_data('data/csur', csur): | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data['GeneralCategory'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
# data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that |
import ucd | import ucd | ||||
ucd_rootdir = sys.argv[1] | ucd_rootdir = sys.argv[1] | ||||
csur_rootdir = 'data/csur' | |||||
unicode_chars = {} | unicode_chars = {} | ||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint]['Script'] = data['Script'] | unicode_chars[codepoint]['Script'] = data['Script'] | ||||
if '--with-csur' in sys.argv: | |||||
for csur in ['Klingon']: | |||||
for data in ucd.parse_ucd_data('data/csur', csur): | |||||
for codepoint in data['CodePoint']: | |||||
if not 'TitleCase' in data: data['TitleCase'] = codepoint | |||||
if not 'UpperCase' in data: data['UpperCase'] = codepoint | |||||
if not 'LowerCase' in data: data['LowerCase'] = codepoint | |||||
if not 'Properties' in data: data['Properties'] = [] | |||||
unicode_chars[codepoint] = data | |||||
null = ucd.CodePoint('0000') | null = ucd.CodePoint('0000') | ||||
if __name__ == '__main__': | if __name__ == '__main__': |
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
unicode_chars[codepoint] = data['Script'] | unicode_chars[codepoint] = data['Script'] | ||||
if '--with-csur' in sys.argv: | |||||
for csur in ['Klingon']: | |||||
for data in ucd.parse_ucd_data('data/csur', csur): | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data['Script'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
# data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that |
return x, [] | return x, [] | ||||
data_items = { | data_items = { | ||||
# Unicode Character Data: | |||||
'Blocks': [ | 'Blocks': [ | ||||
('Range', codepoint), | ('Range', codepoint), | ||||
('Name', string) | ('Name', string) | ||||
('LowerCase', codepoint), | ('LowerCase', codepoint), | ||||
('TitleCase', codepoint), | ('TitleCase', codepoint), | ||||
], | ], | ||||
# Supplemental Data: | |||||
# ConScript Unicode Registry Data: | |||||
'Klingon': [ | 'Klingon': [ | ||||
('CodePoint', codepoint), | ('CodePoint', codepoint), | ||||
('Script', string), | ('Script', string), |