# build output: | # build output: | ||||
src/libucd.la | src/libucd.la | ||||
tests/printucddata | |||||
# test output: | |||||
tests/*.expected | |||||
tests/*.actual | |||||
tests/*.diff | |||||
# autotools output: | # autotools output: | ||||
libucd_include_HEADERS = \ | libucd_include_HEADERS = \ | ||||
src/include/ucd/ucd.h | src/include/ucd/ucd.h | ||||
lib_LTLIBRARIES += src/libucd.la | |||||
src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION) | |||||
lib_LTLIBRARIES += src/libucd.la | |||||
src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION) | |||||
src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS} | src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS} | ||||
src_libucd_la_SOURCES = \ | |||||
src_libucd_la_SOURCES = \ | |||||
src/case.cpp \ | src/case.cpp \ | ||||
src/categories.cpp \ | src/categories.cpp \ | ||||
src/ctype.cpp | src/ctype.cpp | ||||
############################# tests ########################################### | |||||
noinst_bin_PROGRAMS += tests/printucddata | |||||
tests_printucddata_SOURCES = tests/printucddata.cpp | |||||
tests_printucddata_LDADD = src/libucd.la | |||||
tests/unicode-data.expected: tools/printdata.py tools/ucd.py | |||||
tools/printdata.py ${UCD_ROOTDIR} > $@ | |||||
tests/unicode-data.actual: tests/printucddata | |||||
tests/printucddata > $@ | |||||
tests/unicode-data.diff: tests/unicode-data.expected tests/unicode-data.actual | |||||
diff -U0 tests/unicode-data.expected tests/unicode-data.actual > tests/unicode-data.diff | |||||
check: tests/unicode-data.diff |
int pos = (begin + end) / 2; | int pos = (begin + end) / 2; | ||||
const case_conversion_entry *item = (case_conversion_data + pos); | const case_conversion_entry *item = (case_conversion_data + pos); | ||||
if (c == item->codepoint) | if (c == item->codepoint) | ||||
return item->uppercase; | |||||
return item->uppercase == 0 ? c : item->uppercase; | |||||
else if (c > item->codepoint) | else if (c > item->codepoint) | ||||
begin = pos + 1; | begin = pos + 1; | ||||
else | else | ||||
int pos = (begin + end) / 2; | int pos = (begin + end) / 2; | ||||
const case_conversion_entry *item = (case_conversion_data + pos); | const case_conversion_entry *item = (case_conversion_data + pos); | ||||
if (c == item->codepoint) | if (c == item->codepoint) | ||||
return item->lowercase; | |||||
return item->lowercase == 0 ? c : item->lowercase; | |||||
else if (c > item->codepoint) | else if (c > item->codepoint) | ||||
begin = pos + 1; | begin = pos + 1; | ||||
else | else | ||||
int pos = (begin + end) / 2; | int pos = (begin + end) / 2; | ||||
const case_conversion_entry *item = (case_conversion_data + pos); | const case_conversion_entry *item = (case_conversion_data + pos); | ||||
if (c == item->codepoint) | if (c == item->codepoint) | ||||
return item->titlecase; | |||||
return item->titlecase == 0 ? c : item->titlecase; | |||||
else if (c > item->codepoint) | else if (c > item->codepoint) | ||||
begin = pos + 1; | begin = pos + 1; | ||||
else | else |
/* | |||||
* Copyright (C) 2012 Reece H. Dunn | |||||
* | |||||
* This file is part of ucd-tools. | |||||
* | |||||
* ucd-tools is free software: you can redistribute it and/or modify | |||||
* it under the terms of the GNU General Public License as published by | |||||
* the Free Software Foundation, either version 3 of the License, or | |||||
* (at your option) any later version. | |||||
* | |||||
* ucd-tools is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
* GNU General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU General Public License | |||||
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
*/ | |||||
#include "ucd/ucd.h" | |||||
#include <stdio.h> | |||||
/* Map a category value to its two-letter abbreviation.
 *
 * c: the category value to name.
 *
 * Returns a pointer to a string literal: the abbreviation paired with c in
 * the table below, or "--" when c matches none of the listed values.
 */
const char *get_category_string(ucd::category c)
{
	using namespace ucd;

	/* One row per category value this program knows how to name. */
	struct category_name
	{
		ucd::category value;
		const char *name;
	};

	static const category_name names[] = {
		{ Cc, "Cc" }, { Cf, "Cf" }, { Cn, "Cn" }, { Co, "Co" }, { Cs, "Cs" },
		{ Ii, "Ii" },
		{ Ll, "Ll" }, { Lm, "Lm" }, { Lo, "Lo" }, { Lt, "Lt" }, { Lu, "Lu" },
		{ Mc, "Mc" }, { Me, "Me" }, { Mn, "Mn" },
		{ Nd, "Nd" }, { Nl, "Nl" }, { No, "No" },
		{ Pc, "Pc" }, { Pd, "Pd" }, { Pe, "Pe" }, { Pf, "Pf" }, { Pi, "Pi" },
		{ Po, "Po" }, { Ps, "Ps" },
		{ Sc, "Sc" }, { Sk, "Sk" }, { Sm, "Sm" }, { So, "So" },
		{ Zl, "Zl" }, { Zp, "Zp" }, { Zs, "Zs" },
	};

	const unsigned count = sizeof(names) / sizeof(names[0]);
	for (unsigned i = 0; i != count; ++i)
	{
		if (names[i].value == c)
			return names[i].name;
	}

	/* Anything not listed above gets the placeholder. */
	return "--";
}
int main() | |||||
{ | |||||
for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c) | |||||
{ | |||||
const char *category = get_category_string(ucd::lookup_category(c)); | |||||
ucd::codepoint_t upper = ucd::toupper(c); | |||||
ucd::codepoint_t lower = ucd::tolower(c); | |||||
ucd::codepoint_t title = ucd::totitle(c); | |||||
printf("%06X %s %06X %06X %06X\n", c, category, upper, lower, title); | |||||
} | |||||
return 0; | |||||
} |
sys.stdout.write('\t\tint pos = (begin + end) / 2;\n') | sys.stdout.write('\t\tint pos = (begin + end) / 2;\n') | ||||
sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n') | sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n') | ||||
sys.stdout.write('\t\tif (c == item->codepoint)\n') | sys.stdout.write('\t\tif (c == item->codepoint)\n') | ||||
sys.stdout.write('\t\t\treturn item->%scase;\n' % case) | |||||
sys.stdout.write('\t\t\treturn item->%scase == 0 ? c : item->%scase;\n' % (case, case)) | |||||
sys.stdout.write('\t\telse if (c > item->codepoint)\n') | sys.stdout.write('\t\telse if (c > item->codepoint)\n') | ||||
sys.stdout.write('\t\t\tbegin = pos + 1;\n') | sys.stdout.write('\t\t\tbegin = pos + 1;\n') | ||||
sys.stdout.write('\t\telse\n') | sys.stdout.write('\t\telse\n') |
unicode_chars = {} | unicode_chars = {} | ||||
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'): | ||||
if isinstance(data['CodePoint'], ucd.CodePoint): | |||||
unicode_chars[data['CodePoint']] = data['GeneralCategory'] | |||||
for codepoint in data['CodePoint']: | |||||
unicode_chars[codepoint] = data['GeneralCategory'] | |||||
# This map is a combination of the information in the UnicodeData and Blocks | # This map is a combination of the information in the UnicodeData and Blocks | ||||
# data files. It is intended to reduce the number of character tables that | # data files. It is intended to reduce the number of character tables that |
#!/usr/bin/python | |||||
# Copyright (C) 2012 Reece H. Dunn | |||||
# | |||||
# This file is part of ucd-tools. | |||||
# | |||||
# ucd-tools is free software: you can redistribute it and/or modify | |||||
# it under the terms of the GNU General Public License as published by | |||||
# the Free Software Foundation, either version 3 of the License, or | |||||
# (at your option) any later version. | |||||
# | |||||
# ucd-tools is distributed in the hope that it will be useful, | |||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
# GNU General Public License for more details. | |||||
# | |||||
# You should have received a copy of the GNU General Public License | |||||
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>. | |||||
import os | |||||
import sys | |||||
import ucd | |||||
# Location of the UCD data files, taken from the first command-line argument.
ucd_rootdir = sys.argv[1]

# Index each UnicodeData record under every code point that iterating its
# 'CodePoint' field yields, so a record covering several code points is
# reachable from each of them.
unicode_chars = {}
for record in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
	unicode_chars.update((codepoint, record) for codepoint in record['CodePoint'])

# Code point 0000; the driver below compares case-mapping fields against it.
# NOTE(review): presumably the value the parser stores for an absent mapping -- confirm in ucd.py.
null = ucd.CodePoint('0000')
if __name__ == '__main__':
	# Write one line per code point in 000000..10FFFF:
	#     <codepoint> <GeneralCategory> <upper> <lower> <title>
	# print() is called with a single pre-formatted string so the output is
	# the same whether this runs under Python 2 or Python 3.
	for codepoint in ucd.CodeRange('000000..10FFFF'):
		try:
			data = unicode_chars[codepoint]
			title = data['TitleCase']
			upper = data['UpperCase']
			lower = data['LowerCase']
			# A case field equal to `null` is reported as the code point itself.
			if title == null: title = codepoint
			if upper == null: upper = codepoint
			if lower == null: lower = codepoint
			print('%s %s %s %s %s' % (codepoint, data['GeneralCategory'], upper, lower, title))
		except KeyError:
			# No usable record for this code point: report it as Cn, mapping to itself.
			print('%s Cn %s %s %s' % (codepoint, codepoint, codepoint, codepoint))
def __str__(self): | def __str__(self): | ||||
return '%06X' % self.codepoint | return '%06X' % self.codepoint | ||||
def __iter__(self): | |||||
yield self | |||||
def __hash__(self): | def __hash__(self): | ||||
return self.codepoint | return self.codepoint | ||||