12 years ago · ea09eb5c45
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,13 @@
 # build output:

 src/libucd.la
 tests/printucddata

 # test output:

 tests/*.expected
 tests/*.actual
 tests/*.diff

 # autotools output:

--- a/Makefile.am
+++ b/Makefile.am
@@ -66,11 +66,27 @@ libucd_includedir = $(includedir)/ucd
 libucd_include_HEADERS = \
 	src/include/ucd/ucd.h

 lib_LTLIBRARIES += src/libucd.la

 src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION)
 lib_LTLIBRARIES       += src/libucd.la
 src_libucd_la_LDFLAGS  = -version-info $(LIBUCD_VERSION)
 src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS}
 src_libucd_la_SOURCES = \
 src_libucd_la_SOURCES  = \
 	src/case.cpp \
 	src/categories.cpp \
 	src/ctype.cpp

 ############################# tests ###########################################

 noinst_bin_PROGRAMS       += tests/printucddata
 tests_printucddata_SOURCES = tests/printucddata.cpp
 tests_printucddata_LDADD   = src/libucd.la

 # Expected output, generated from the UCD data files by the Python tools.
 # The target is removed if the generator fails, so a partial file is never
 # treated as up to date by a later run.
 tests/unicode-data.expected: tools/printdata.py tools/ucd.py
 	tools/printdata.py ${UCD_ROOTDIR} > $@ || (rm -f $@ ; exit 1)

 # Actual output, produced by the test program linked against libucd.
 # Removed on failure for the same reason as the expected output.
 tests/unicode-data.actual: tests/printucddata
 	tests/printucddata > $@ || (rm -f $@ ; exit 1)

 # diff exits non-zero when the two files differ. Without the cleanup the
 # redirect would leave an up-to-date .diff file behind, and the next
 # "make check" would find nothing to do and report success. Print the
 # differences, remove the target and fail instead.
 tests/unicode-data.diff: tests/unicode-data.expected tests/unicode-data.actual
 	diff -U0 tests/unicode-data.expected tests/unicode-data.actual > $@ || (cat $@ ; rm -f $@ ; exit 1)

 check: tests/unicode-data.diff
--- a/src/case.cpp
+++ b/src/case.cpp
@@ -2140,7 +2140,7 @@ ucd::codepoint_t ucd::toupper(codepoint_t c)
 		int pos = (begin + end) / 2;
 		const case_conversion_entry *item = (case_conversion_data + pos);
 		if (c == item->codepoint)
 			return item->uppercase;
 			return item->uppercase == 0 ? c : item->uppercase;
 		else if (c > item->codepoint)
 			begin = pos + 1;
 		else
@@ -2158,7 +2158,7 @@ ucd::codepoint_t ucd::tolower(codepoint_t c)
 		int pos = (begin + end) / 2;
 		const case_conversion_entry *item = (case_conversion_data + pos);
 		if (c == item->codepoint)
 			return item->lowercase;
 			return item->lowercase == 0 ? c : item->lowercase;
 		else if (c > item->codepoint)
 			begin = pos + 1;
 		else
@@ -2176,7 +2176,7 @@ ucd::codepoint_t ucd::totitle(codepoint_t c)
 		int pos = (begin + end) / 2;
 		const case_conversion_entry *item = (case_conversion_data + pos);
 		if (c == item->codepoint)
 			return item->titlecase;
 			return item->titlecase == 0 ? c : item->titlecase;
 		else if (c > item->codepoint)
 			begin = pos + 1;
 		else
--- a/src/categories.cpp
+++ b/src/categories.cpp
--- a/tests/printucddata.cpp
+++ b/tests/printucddata.cpp
@@ -0,0 +1,75 @@
 /*
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */

 #include "ucd/ucd.h"

 #include <stdio.h>

 // Returns the two-letter name printed for a ucd::category value.
 // Values without a case below are reported as "--".
 const char *get_category_string(ucd::category c)
 {
 	using namespace ucd;
 	const char *name = "--";
 	switch (c)
 	{
 	case Cc: name = "Cc"; break;
 	case Cf: name = "Cf"; break;
 	case Cn: name = "Cn"; break;
 	case Co: name = "Co"; break;
 	case Cs: name = "Cs"; break;
 	case Ii: name = "Ii"; break;
 	case Ll: name = "Ll"; break;
 	case Lm: name = "Lm"; break;
 	case Lo: name = "Lo"; break;
 	case Lt: name = "Lt"; break;
 	case Lu: name = "Lu"; break;
 	case Mc: name = "Mc"; break;
 	case Me: name = "Me"; break;
 	case Mn: name = "Mn"; break;
 	case Nd: name = "Nd"; break;
 	case Nl: name = "Nl"; break;
 	case No: name = "No"; break;
 	case Pc: name = "Pc"; break;
 	case Pd: name = "Pd"; break;
 	case Pe: name = "Pe"; break;
 	case Pf: name = "Pf"; break;
 	case Pi: name = "Pi"; break;
 	case Po: name = "Po"; break;
 	case Ps: name = "Ps"; break;
 	case Sc: name = "Sc"; break;
 	case Sk: name = "Sk"; break;
 	case Sm: name = "Sm"; break;
 	case So: name = "So"; break;
 	case Zl: name = "Zl"; break;
 	case Zp: name = "Zp"; break;
 	case Zs: name = "Zs"; break;
 	default: break; // keep the "--" placeholder
 	}
 	return name;
 }

 int main()
 {
 	for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
 	{
 		const char *category = get_category_string(ucd::lookup_category(c));
 		ucd::codepoint_t upper = ucd::toupper(c);
 		ucd::codepoint_t lower = ucd::tolower(c);
 		ucd::codepoint_t title = ucd::totitle(c);
 		printf("%06X %s %06X %06X %06X\n", c, category, upper, lower, title);
 	}
 	return 0;
 }
--- a/tools/case.py
+++ b/tools/case.py
@@ -90,7 +90,7 @@ struct case_conversion_entry
 		sys.stdout.write('\t\tint pos = (begin + end) / 2;\n')
 		sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n')
 		sys.stdout.write('\t\tif (c == item->codepoint)\n')
 		sys.stdout.write('\t\t\treturn item->%scase;\n' % case)
 		sys.stdout.write('\t\t\treturn item->%scase == 0 ? c : item->%scase;\n' % (case, case))
 		sys.stdout.write('\t\telse if (c > item->codepoint)\n')
 		sys.stdout.write('\t\t\tbegin = pos + 1;\n')
 		sys.stdout.write('\t\telse\n')
--- a/tools/categories.py
+++ b/tools/categories.py
@@ -26,8 +26,8 @@ ucd_version = ucd_rootdir.split('-')[-1]

 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	if isinstance(data['CodePoint'], ucd.CodePoint):
 		unicode_chars[data['CodePoint']] = data['GeneralCategory']
 	for codepoint in data['CodePoint']:
 		unicode_chars[codepoint] = data['GeneralCategory']

 # This map is a combination of the information in the UnicodeData and Blocks
 # data files. It is intended to reduce the number of character tables that
--- a/tools/printdata.py
+++ b/tools/printdata.py
@@ -0,0 +1,44 @@
 #!/usr/bin/python

 # Copyright (C) 2012 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
 # ucd-tools is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # ucd-tools is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

 import os
 import sys
 import ucd

 # Location of the UCD data files, taken from the first command-line argument.
 ucd_rootdir = sys.argv[1]

 # Map every codepoint named by a UnicodeData record to that record.
 # Iterating data['CodePoint'] yields each codepoint the record applies to,
 # so one record may fill in more than one key.
 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	for codepoint in data['CodePoint']:
 		unicode_chars[codepoint] = data

 # A case field equal to codepoint 0000 is replaced below by the codepoint
 # itself, i.e. the character is printed as mapping to itself.
 null = ucd.CodePoint('0000')
 if __name__ == '__main__':
 	# Print one line per codepoint in 000000..10FFFF:
 	#   <codepoint> <general category> <upper> <lower> <title>
 	for codepoint in ucd.CodeRange('000000..10FFFF'):
 		try:
 			data = unicode_chars[codepoint]
 			title = data['TitleCase']
 			upper = data['UpperCase']
 			lower = data['LowerCase']
 			if title == null: title = codepoint
 			if upper == null: upper = codepoint
 			if lower == null: lower = codepoint
 			print '%s %s %s %s %s' % (codepoint, data['GeneralCategory'], upper, lower, title)
 		except KeyError:
 			# Any KeyError raised above -- e.g. a codepoint with no entry in
 			# unicode_chars -- is reported as category Cn with all three
 			# case mappings equal to the codepoint itself.
 			print '%s Cn %s %s %s' % (codepoint, codepoint, codepoint, codepoint)
--- a/tools/ucd.py
+++ b/tools/ucd.py
@@ -33,6 +33,9 @@ class CodePoint:
 	def __str__(self):
 		"""Format the stored codepoint as upper-case hex, zero-padded to at least six digits."""
 		return '%06X' % self.codepoint

 	def __iter__(self):
 		"""Yield this object once, so a single codepoint can be used in a for loop."""
 		yield self

 	def __hash__(self):
 		"""Use the stored codepoint value as the hash."""
 		return self.codepoint