Browse Source

Do not include supplementary data in the UCD APIs.

This removes support for the CSUR (ConScript Unicode Registry) data
in the main Unicode APIs. This data should be accessed through a
different API.
master
Reece H. Dunn 11 years ago
parent
commit
ced06ed0f4
6 changed files with 46 additions and 76 deletions
  1. 0
    3
      Makefile.am
  2. 1
    2
      README.md
  3. 21
    21
      src/categories.cpp
  4. 21
    41
      src/scripts.cpp
  5. 2
    5
      tools/categories.py
  6. 1
    4
      tools/scripts.py

+ 0
- 3
Makefile.am View File

@@ -86,11 +86,9 @@ tools/case.py: tools/ucd.py \
data/ucd/UnicodeData.txt

tools/categories.py: tools/ucd.py \
supplemental/Klingon.txt \
data/ucd/UnicodeData.txt

tools/scripts.py: tools/ucd.py \
supplemental/Klingon.txt \
data/ucd/Scripts.txt

ucd-update: tools/case.py tools/categories.py tools/scripts.py
@@ -119,7 +117,6 @@ tests_printucddata_SOURCES = tests/printucddata.cpp
tests_printucddata_LDADD = src/libucd.la

tests/unicode-data.expected: tools/printdata.py tools/ucd.py \
supplemental/Klingon.txt \
data/ucd/UnicodeData.txt \
data/ucd/PropList.txt \
data/ucd/Scripts.txt

+ 1
- 2
README.md View File

@@ -22,8 +22,7 @@ In addition to this it provides APIs for:
- converting to upper, lower and title case.

The following data sets are used for the data tables:
- [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/);
- [Klingon](http://www.evertype.com/standards/csur/klingon.html) [F8D0-F8FF].
- [Unicode Character Data 7.0.0](http://www.unicode.org/Public/7.0.0/ucd/).

## Build Dependencies


+ 21
- 21
src/categories.cpp View File

@@ -49,6 +49,26 @@ static const uint8_t categories_Cn[256] =
/* F0 */ Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn, Cn,
};

// Shared 256-entry page table in which every slot holds the value Co.
// Each row below covers 16 consecutive slots; the leading /* XX */ comment is
// the hex index of the first slot in that row.
// It is listed as the 00F800 entry of categories_00F800_02FAFF in this view.
// NOTE(review): Co is presumably the Unicode general category "Private Use";
// the enumerator is declared outside this hunk -- confirm.
static const uint8_t categories_Co[256] =
{
/* 00 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 10 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 20 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 30 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 40 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 50 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 60 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 70 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 80 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 90 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* A0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* B0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* C0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* D0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* E0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* F0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
};

static const uint8_t categories_Lo[256] =
{
/* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo,
@@ -1249,26 +1269,6 @@ static const uint8_t categories_00D700[256] =
/* F0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Cn, Cn, Cn, Cn,
};

// Dedicated 256-entry page table named for page 00F800.
// Slots 00-CF are Co. The remaining rows mix in other values:
//   D0-E9  Lo
//   EA-EF  Co
//   F0-F9  Nd
//   FA-FC  Co
//   FD-FE  Po
//   FF     So
// NOTE(review): the non-Co slots line up with the Klingon range [F8D0-F8FF]
// cited in the README hunk of this change; per the commit message this
// supplementary data is being dropped from the main tables -- confirm that
// this table is the removed side of the hunk.
static const uint8_t categories_00F800[256] =
{
/* 00 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 10 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 20 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 30 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 40 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 50 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 60 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 70 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 80 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* 90 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* A0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* B0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* C0 */ Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co, Co,
/* D0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo,
/* E0 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Co, Co, Co, Co, Co, Co,
/* F0 */ Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Co, Co, Co, Po, Po, So,
};

static const uint8_t categories_00FA00[256] =
{
/* 00 */ Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo, Lo,
@@ -2691,7 +2691,7 @@ static const uint8_t *categories_000000_00D7FF[] =

static const uint8_t *categories_00F800_02FAFF[] =
{
categories_00F800,
categories_Co, // 00F800
categories_Lo, // 00F900
categories_00FA00,
categories_00FB00,

+ 21
- 41
src/scripts.cpp View File

@@ -189,6 +189,26 @@ static const uint8_t scripts_Vaii[256] =
/* F0 */ Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii, Vaii,
};

// Shared 256-entry page table in which every slot holds the value Zzzz.
// Each row below covers 16 consecutive slots; the leading /* XX */ comment is
// the hex index of the first slot in that row.
// It is listed as the 00F800 entry of scripts_00F800_02FAFF in this view.
// NOTE(review): Zzzz is presumably the script code for unknown/uncoded
// script; the enumerator is declared outside this hunk -- confirm.
// NOTE(review): an identical definition of this table appears in another hunk
// of this diff view; only one of the two can exist in the real file.
static const uint8_t scripts_Zzzz[256] =
{
/* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* D0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* E0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* F0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
};

static const uint8_t scripts_Arab[256] =
{
/* 00 */ Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab, Arab,
@@ -229,26 +249,6 @@ static const uint8_t scripts_Lina[256] =
/* F0 */ Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina, Lina,
};

// Shared 256-entry page table in which every slot holds the value Zzzz.
// Each row below covers 16 consecutive slots; the leading /* XX */ comment is
// the hex index of the first slot in that row.
// NOTE(review): Zzzz is presumably the script code for unknown/uncoded
// script; the enumerator is declared outside this hunk -- confirm.
// NOTE(review): an identical definition of this table appears in an earlier
// hunk of this diff view; only one of the two can exist in the real file.
static const uint8_t scripts_Zzzz[256] =
{
/* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* D0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* E0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* F0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
};

static const uint8_t scripts_Xsux[256] =
{
/* 00 */ Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux, Xsux,
@@ -1309,26 +1309,6 @@ static const uint8_t scripts_00D700[256] =
/* F0 */ Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Hang, Zzzz, Zzzz, Zzzz, Zzzz,
};

// Dedicated 256-entry page table named for page 00F800.
// Slots 00-CF are Zzzz. The remaining rows mix in other values:
//   D0-E9  Qaak
//   EA-EF  Zzzz
//   F0-F9  Qaak
//   FA-FC  Zzzz
//   FD-FF  Zyyy
// NOTE(review): the non-Zzzz slots line up with the Klingon range [F8D0-F8FF]
// cited in the README hunk of this change; per the commit message this
// supplementary data is being dropped from the main tables -- confirm that
// this table is the removed side of the hunk.
static const uint8_t scripts_00F800[256] =
{
/* 00 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 10 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 20 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 30 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 40 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 50 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 60 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 70 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 80 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* 90 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* A0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* B0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* C0 */ Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* D0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak,
/* E0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz, Zzzz,
/* F0 */ Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Qaak, Zzzz, Zzzz, Zzzz, Zyyy, Zyyy, Zyyy,
};

static const uint8_t scripts_00FA00[256] =
{
/* 00 */ Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant, Hant,
@@ -2751,7 +2731,7 @@ static const uint8_t *scripts_000000_00D7FF[] =

static const uint8_t *scripts_00F800_02FAFF[] =
{
scripts_00F800,
scripts_Zzzz, // 00F800
scripts_Hant, // 00F900
scripts_00FA00,
scripts_00FB00,

+ 2
- 5
tools/categories.py View File

@@ -1,6 +1,6 @@
#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
# Copyright (C) 2012, 2014 Reece H. Dunn
#
# This file is part of ucd-tools.
#
@@ -28,9 +28,6 @@ unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['GeneralCategory']
for data in ucd.parse_ucd_data('supplemental', 'Klingon'):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['GeneralCategory']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
@@ -52,7 +49,7 @@ category_sets = [
# These categories have many pages consisting of just this category:
# Cn -- Unassigned
# Co -- Private Use
# Lo -- CJK Ideographs
special_categories = ['Cn', 'Lo', 'Sm', 'So']
special_categories = ['Cn', 'Co', 'Lo', 'Sm', 'So']

category_tables = {}
for codepoints, category, comment in category_sets:

+ 1
- 4
tools/scripts.py View File

@@ -1,6 +1,6 @@
#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
# Copyright (C) 2012, 2014 Reece H. Dunn
#
# This file is part of ucd-tools.
#
@@ -28,9 +28,6 @@ unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
for codepoint in data['Range']:
unicode_chars[codepoint] = data['Script']
for data in ucd.parse_ucd_data('supplemental', 'Klingon'):
for codepoint in data['CodePoint']:
unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that

Loading…
Cancel
Save