| data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
| ucd-update: tools/case.py tools/categories.py tools/scripts.py | ucd-update: tools/case.py tools/categories.py tools/scripts.py | ||||
| tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/case.cpp | |||||
| tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/categories.cpp | |||||
| tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/scripts.cpp | |||||
| tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/case.cpp | |||||
| tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/categories.cpp | |||||
| tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/scripts.cpp | |||||
| libucd_includedir = $(includedir)/ucd | libucd_includedir = $(includedir)/ucd | ||||
| libucd_include_HEADERS = \ | libucd_include_HEADERS = \ | ||||
| data/ucd/UnicodeData.txt \ | data/ucd/UnicodeData.txt \ | ||||
| data/ucd/PropList.txt \ | data/ucd/PropList.txt \ | ||||
| data/ucd/Scripts.txt | data/ucd/Scripts.txt | ||||
| tools/printdata.py ${UCD_ROOTDIR} > $@ | |||||
| tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@ | |||||
| tests/unicode-data.actual: tests/printucddata | tests/unicode-data.actual: tests/printucddata | ||||
| tests/printucddata > $@ | tests/printucddata > $@ |
| The following data sets are used for the data tables: | The following data sets are used for the data tables: | ||||
| - [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/). | - [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/). | ||||
| ## ConScript Unicode Registry | |||||
| If enabled, the following data from the | |||||
| [ConScript Unicode Registry](http://www.evertype.com/standards/csur/) (CSUR) is | |||||
| added: | |||||
| | Code Range | Script | | |||||
| |------------|---------| | |||||
| | F8D0-F8FF | [Klingon](http://www.evertype.com/standards/csur/klingon.html) | | |||||
| This data is located in the `data/csur` directory. | |||||
| ## Build Dependencies | ## Build Dependencies | ||||
| In order to build ucd-tools, you need: | In order to build ucd-tools, you need: | ||||
| where `VERSION` is the Unicode version (e.g. `6.3.0`). | where `VERSION` is the Unicode version (e.g. `6.3.0`). | ||||
| Additionally, you can use the `UCD_FLAGS` option to control how the data is | |||||
| generated. The following flags are supported: | |||||
| | Flag | Description | | |||||
| |-------------|-------------| | |||||
| | --with-csur | Add ConScript Unicode Registry data. | | |||||
| ## Bugs | ## Bugs | ||||
| Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues) | Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues) |
| for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
| for codepoint in data['CodePoint']: | for codepoint in data['CodePoint']: | ||||
| unicode_chars[codepoint] = data['GeneralCategory'] | unicode_chars[codepoint] = data['GeneralCategory'] | ||||
| if '--with-csur' in sys.argv: | |||||
| for csur in ['Klingon']: | |||||
| for data in ucd.parse_ucd_data('data/csur', csur): | |||||
| for codepoint in data['CodePoint']: | |||||
| unicode_chars[codepoint] = data['GeneralCategory'] | |||||
| # This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
| # data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that |
| import ucd | import ucd | ||||
| ucd_rootdir = sys.argv[1] | ucd_rootdir = sys.argv[1] | ||||
| csur_rootdir = 'data/csur' | |||||
| unicode_chars = {} | unicode_chars = {} | ||||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
| for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
| unicode_chars[codepoint]['Script'] = data['Script'] | unicode_chars[codepoint]['Script'] = data['Script'] | ||||
| if '--with-csur' in sys.argv: | |||||
| for csur in ['Klingon']: | |||||
| for data in ucd.parse_ucd_data('data/csur', csur): | |||||
| for codepoint in data['CodePoint']: | |||||
| if not 'TitleCase' in data: data['TitleCase'] = codepoint | |||||
| if not 'UpperCase' in data: data['UpperCase'] = codepoint | |||||
| if not 'LowerCase' in data: data['LowerCase'] = codepoint | |||||
| if not 'Properties' in data: data['Properties'] = [] | |||||
| unicode_chars[codepoint] = data | |||||
| null = ucd.CodePoint('0000') | null = ucd.CodePoint('0000') | ||||
| if __name__ == '__main__': | if __name__ == '__main__': |
| for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'): | ||||
| for codepoint in data['Range']: | for codepoint in data['Range']: | ||||
| unicode_chars[codepoint] = data['Script'] | unicode_chars[codepoint] = data['Script'] | ||||
| if '--with-csur' in sys.argv: | |||||
| for csur in ['Klingon']: | |||||
| for data in ucd.parse_ucd_data('data/csur', csur): | |||||
| for codepoint in data['CodePoint']: | |||||
| unicode_chars[codepoint] = data['Script'] | |||||
| # This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
| # data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that |
| return x, [] | return x, [] | ||||
| data_items = { | data_items = { | ||||
| # Unicode Character Data: | |||||
| 'Blocks': [ | 'Blocks': [ | ||||
| ('Range', codepoint), | ('Range', codepoint), | ||||
| ('Name', string) | ('Name', string) | ||||
| ('LowerCase', codepoint), | ('LowerCase', codepoint), | ||||
| ('TitleCase', codepoint), | ('TitleCase', codepoint), | ||||
| ], | ], | ||||
| # Supplemental Data: | |||||
| # ConScript Unicode Registry Data: | |||||
| 'Klingon': [ | 'Klingon': [ | ||||
| ('CodePoint', codepoint), | ('CodePoint', codepoint), | ||||
| ('Script', string), | ('Script', string), |