@@ -13,6 +13,13 @@ | |||
# build output: | |||
src/libucd.la | |||
tests/printucddata | |||
# test output: | |||
tests/*.expected | |||
tests/*.actual | |||
tests/*.diff | |||
# autotools output: | |||
@@ -66,11 +66,27 @@ libucd_includedir = $(includedir)/ucd | |||
libucd_include_HEADERS = \ | |||
src/include/ucd/ucd.h | |||
lib_LTLIBRARIES += src/libucd.la | |||
src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION) | |||
lib_LTLIBRARIES += src/libucd.la | |||
src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION) | |||
src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS} | |||
src_libucd_la_SOURCES = \ | |||
src_libucd_la_SOURCES = \ | |||
src/case.cpp \ | |||
src/categories.cpp \ | |||
src/ctype.cpp | |||
############################# tests ########################################### | |||
noinst_bin_PROGRAMS += tests/printucddata | |||
tests_printucddata_SOURCES = tests/printucddata.cpp | |||
tests_printucddata_LDADD = src/libucd.la | |||
# Reference output generated from the Unicode data files.  The recipe
# redirects into $@, so the target file is created even when the command
# fails; remove it in that case so a later run does not treat the partial
# output as an up-to-date target.
tests/unicode-data.expected: tools/printdata.py tools/ucd.py
	tools/printdata.py ${UCD_ROOTDIR} > $@ || { rm -f $@; exit 1; }

# Output produced by the library under test.
tests/unicode-data.actual: tests/printucddata
	tests/printucddata > $@ || { rm -f $@; exit 1; }

# diff exits non-zero when the two files differ.  Without cleanup the .diff
# file would stay behind, newer than its prerequisites, and the next
# `make check` would consider it up to date and succeed.  Show the
# differences, then remove the target so the failure is reported every time.
tests/unicode-data.diff: tests/unicode-data.expected tests/unicode-data.actual
	diff -U0 tests/unicode-data.expected tests/unicode-data.actual > $@ || { cat $@; rm -f $@; exit 1; }
check: tests/unicode-data.diff |
@@ -2140,7 +2140,7 @@ ucd::codepoint_t ucd::toupper(codepoint_t c) | |||
int pos = (begin + end) / 2; | |||
const case_conversion_entry *item = (case_conversion_data + pos); | |||
if (c == item->codepoint) | |||
return item->uppercase; | |||
return item->uppercase == 0 ? c : item->uppercase; | |||
else if (c > item->codepoint) | |||
begin = pos + 1; | |||
else | |||
@@ -2158,7 +2158,7 @@ ucd::codepoint_t ucd::tolower(codepoint_t c) | |||
int pos = (begin + end) / 2; | |||
const case_conversion_entry *item = (case_conversion_data + pos); | |||
if (c == item->codepoint) | |||
return item->lowercase; | |||
return item->lowercase == 0 ? c : item->lowercase; | |||
else if (c > item->codepoint) | |||
begin = pos + 1; | |||
else | |||
@@ -2176,7 +2176,7 @@ ucd::codepoint_t ucd::totitle(codepoint_t c) | |||
int pos = (begin + end) / 2; | |||
const case_conversion_entry *item = (case_conversion_data + pos); | |||
if (c == item->codepoint) | |||
return item->titlecase; | |||
return item->titlecase == 0 ? c : item->titlecase; | |||
else if (c > item->codepoint) | |||
begin = pos + 1; | |||
else |
@@ -0,0 +1,75 @@ | |||
/* | |||
* Copyright (C) 2012 Reece H. Dunn | |||
* | |||
* This file is part of ucd-tools. | |||
* | |||
* ucd-tools is free software: you can redistribute it and/or modify | |||
* it under the terms of the GNU General Public License as published by | |||
* the Free Software Foundation, either version 3 of the License, or | |||
* (at your option) any later version. | |||
* | |||
* ucd-tools is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
* GNU General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU General Public License | |||
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
*/ | |||
#include "ucd/ucd.h" | |||
#include <stdio.h> | |||
/** Look up the two-letter name used to print a category value.
  *
  * @param c The category value to name.
  * @return  A pointer to a string literal holding the two-letter name, or
  *          "--" when @a c is not one of the values in the table below.
  */
const char *get_category_string(ucd::category c)
{
	using namespace ucd;

	struct category_name
	{
		category value;
		const char *name;
	};

	// One row per category value this program knows how to print.
	static const category_name names[] = {
		{ Cc, "Cc" }, { Cf, "Cf" }, { Cn, "Cn" }, { Co, "Co" }, { Cs, "Cs" },
		{ Ii, "Ii" },
		{ Ll, "Ll" }, { Lm, "Lm" }, { Lo, "Lo" }, { Lt, "Lt" }, { Lu, "Lu" },
		{ Mc, "Mc" }, { Me, "Me" }, { Mn, "Mn" },
		{ Nd, "Nd" }, { Nl, "Nl" }, { No, "No" },
		{ Pc, "Pc" }, { Pd, "Pd" }, { Pe, "Pe" }, { Pf, "Pf" }, { Pi, "Pi" },
		{ Po, "Po" }, { Ps, "Ps" },
		{ Sc, "Sc" }, { Sk, "Sk" }, { Sm, "Sm" }, { So, "So" },
		{ Zl, "Zl" }, { Zp, "Zp" }, { Zs, "Zs" },
	};

	const category_name *last = names + sizeof(names) / sizeof(names[0]);
	for (const category_name *entry = names; entry != last; ++entry)
	{
		if (entry->value == c)
			return entry->name;
	}

	// Any value without a row in the table.
	return "--";
}
int main() | |||
{ | |||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||
{ | |||
const char *category = get_category_string(ucd::lookup_category(c)); | |||
ucd::codepoint_t upper = ucd::toupper(c); | |||
ucd::codepoint_t lower = ucd::tolower(c); | |||
ucd::codepoint_t title = ucd::totitle(c); | |||
printf("%06X %s %06X %06X %06X\n", c, category, upper, lower, title); | |||
} | |||
return 0; | |||
} |
@@ -90,7 +90,7 @@ struct case_conversion_entry | |||
sys.stdout.write('\t\tint pos = (begin + end) / 2;\n') | |||
sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n') | |||
sys.stdout.write('\t\tif (c == item->codepoint)\n') | |||
sys.stdout.write('\t\t\treturn item->%scase;\n' % case) | |||
sys.stdout.write('\t\t\treturn item->%scase == 0 ? c : item->%scase;\n' % (case, case)) | |||
sys.stdout.write('\t\telse if (c > item->codepoint)\n') | |||
sys.stdout.write('\t\t\tbegin = pos + 1;\n') | |||
sys.stdout.write('\t\telse\n') |
@@ -26,8 +26,8 @@ ucd_version = ucd_rootdir.split('-')[-1] | |||
unicode_chars = {} | |||
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | |||
if isinstance(data['CodePoint'], ucd.CodePoint): | |||
unicode_chars[data['CodePoint']] = data['GeneralCategory'] | |||
for codepoint in data['CodePoint']: | |||
unicode_chars[codepoint] = data['GeneralCategory'] | |||
# This map is a combination of the information in the UnicodeData and Blocks | |||
# data files. It is intended to reduce the number of character tables that |
@@ -0,0 +1,44 @@ | |||
#!/usr/bin/python | |||
# Copyright (C) 2012 Reece H. Dunn | |||
# | |||
# This file is part of ucd-tools. | |||
# | |||
# ucd-tools is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 3 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# ucd-tools is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# You should have received a copy of the GNU General Public License | |||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||
import os | |||
import sys | |||
import ucd | |||
# Directory holding the Unicode data files, given as the first argument.
ucd_rootdir = sys.argv[1]

# Map each codepoint found in UnicodeData to its full record.  The
# 'CodePoint' field is iterated, so a single record may contribute more
# than one codepoint.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    for codepoint in data['CodePoint']:
        unicode_chars[codepoint] = data

# A case mapping equal to this value is printed as the codepoint itself.
null = ucd.CodePoint('0000')

if __name__ == '__main__':
    # sys.stdout.write emits the same bytes as the print statement and works
    # on both Python 2 and Python 3.
    write = sys.stdout.write
    for codepoint in ucd.CodeRange('000000..10FFFF'):
        # Only the table lookup is guarded: a KeyError raised by a record
        # that lacks one of the fields below must not be mistaken for an
        # unlisted codepoint.
        try:
            data = unicode_chars[codepoint]
        except KeyError:
            # Not listed in UnicodeData: print as Cn, mapping to itself.
            write('%s Cn %s %s %s\n' % (codepoint, codepoint, codepoint, codepoint))
            continue

        title = data['TitleCase']
        upper = data['UpperCase']
        lower = data['LowerCase']
        if title == null:
            title = codepoint
        if upper == null:
            upper = codepoint
        if lower == null:
            lower = codepoint
        write('%s %s %s %s %s\n' % (codepoint, data['GeneralCategory'], upper, lower, title))
@@ -33,6 +33,9 @@ class CodePoint: | |||
def __str__(self):
    """Return the codepoint as uppercase hexadecimal, zero-padded to six digits."""
    text = '%06X' % self.codepoint
    return text
def __iter__(self):
    """Iterate over this object as a sequence containing only itself."""
    for item in (self,):
        yield item
def __hash__(self):
    """Return the codepoint attribute as the hash value."""
    value = self.codepoint
    return value