Browse Source

Support enabling the CSUR data.

master
Reece H. Dunn 10 years ago
parent
commit
bcf8be59b3
6 changed files with 45 additions and 5 deletions
  1. 4
    4
      Makefile.am
  2. 19
    0
      README.md
  3. 5
    0
      tools/categories.py
  4. 10
    0
      tools/printdata.py
  5. 5
    0
      tools/scripts.py
  6. 2
    1
      tools/ucd.py

+ 4
- 4
Makefile.am View File

@@ -92,9 +92,9 @@ tools/scripts.py: tools/ucd.py \
data/ucd/Scripts.txt

ucd-update: tools/case.py tools/categories.py tools/scripts.py
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/case.cpp
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/categories.cpp
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} > src/scripts.cpp
tools/case.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/case.cpp
tools/categories.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/categories.cpp
tools/scripts.py ${UCD_ROOTDIR} ${UCD_VERSION} ${UCD_FLAGS} > src/scripts.cpp

libucd_includedir = $(includedir)/ucd
libucd_include_HEADERS = \
@@ -120,7 +120,7 @@ tests/unicode-data.expected: tools/printdata.py tools/ucd.py \
data/ucd/UnicodeData.txt \
data/ucd/PropList.txt \
data/ucd/Scripts.txt
tools/printdata.py ${UCD_ROOTDIR} > $@
tools/printdata.py ${UCD_ROOTDIR} ${UCD_FLAGS} > $@

tests/unicode-data.actual: tests/printucddata
tests/printucddata > $@

+ 19
- 0
README.md View File

@@ -24,6 +24,18 @@ In addition to this it provides APIs for:
The following data sets are used for the data tables:
- [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/).

## ConScript Unicode Registry

If enabled (by passing the `--with-csur` flag via `UCD_FLAGS` when generating
the data), the following data from the
[ConScript Unicode Registry](http://www.evertype.com/standards/csur/) (CSUR) is
added:

| Code Range | Script |
|------------|---------|
| F8D0-F8FF | [Klingon](http://www.evertype.com/standards/csur/klingon.html) |

This data is located in the `data/csur` directory.

## Build Dependencies

In order to build ucd-tools, you need:
@@ -84,6 +96,13 @@ unicode is released, you need to run:

where `VERSION` is the Unicode version (e.g. `6.3.0`).

Additionally, you can use the `UCD_FLAGS` option to control how the data is
generated. The following flags are supported:

| Flag | Description |
|-------------|-------------|
| --with-csur | Add ConScript Unicode Registry data. |

## Bugs

Report bugs to the [ucd-tools issues](https://github.com/rhdunn/ucd-tools/issues)

+ 5
- 0
tools/categories.py View File

@@ -28,6 +28,11 @@ unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['GeneralCategory']
if '--with-csur' in sys.argv:
for csur in ['Klingon']:
for data in ucd.parse_ucd_data('data/csur', csur):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['GeneralCategory']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that

+ 10
- 0
tools/printdata.py View File

@@ -22,6 +22,7 @@ import sys
import ucd

ucd_rootdir = sys.argv[1]
csur_rootdir = 'data/csur'

unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
@@ -35,6 +36,15 @@ for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
for codepoint in data['Range']:
unicode_chars[codepoint]['Script'] = data['Script']
if '--with-csur' in sys.argv:
for csur in ['Klingon']:
for data in ucd.parse_ucd_data('data/csur', csur):
for codepoint in data['CodePoint']:
if not 'TitleCase' in data: data['TitleCase'] = codepoint
if not 'UpperCase' in data: data['UpperCase'] = codepoint
if not 'LowerCase' in data: data['LowerCase'] = codepoint
if not 'Properties' in data: data['Properties'] = []
unicode_chars[codepoint] = data

null = ucd.CodePoint('0000')
if __name__ == '__main__':

+ 5
- 0
tools/scripts.py View File

@@ -28,6 +28,11 @@ unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
for codepoint in data['Range']:
unicode_chars[codepoint] = data['Script']
if '--with-csur' in sys.argv:
for csur in ['Klingon']:
for data in ucd.parse_ucd_data('data/csur', csur):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that

+ 2
- 1
tools/ucd.py View File

@@ -98,6 +98,7 @@ def strlist(x):
return x, []

data_items = {
# Unicode Character Data:
'Blocks': [
('Range', codepoint),
('Name', string)
@@ -137,7 +138,7 @@ data_items = {
('LowerCase', codepoint),
('TitleCase', codepoint),
],
# Supplemental Data:
# ConScript Unicode Registry Data:
'Klingon': [
('CodePoint', codepoint),
('Script', string),

Loading…
Cancel
Save