12 years ago · 9c3a87dbeb
--- a/Makefile.am
+++ b/Makefile.am
@@ -54,6 +54,9 @@ EXTRA_DIST += ChangeLog

 ############################# Unicode Character Database ######################

 # Sources generated from the Unicode Character Database found in UCD_ROOTDIR.
 # Each generator writes to a temporary file that is renamed onto the target
 # only when the script succeeds, so a failed or interrupted run never leaves
 # a truncated file that make would consider up to date.
 src/case.cpp: tools/case.py tools/ucd.py
 	tools/case.py ${UCD_ROOTDIR} > $@.tmp
 	mv -f $@.tmp $@

 src/categories.cpp: tools/categories.py tools/ucd.py
 	tools/categories.py ${UCD_ROOTDIR} > $@.tmp
 	mv -f $@.tmp $@

@@ -68,5 +71,6 @@ lib_LTLIBRARIES += src/libucd.la
 # Build settings for the libucd libtool library.
 # LIBUCD_VERSION is passed to libtool's -version-info option (it is defined
 # outside this section; presumably in current:revision:age form).
 src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION)
 src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS}
 # src/case.cpp and src/categories.cpp are generated by the tools/*.py rules
 # in the "Unicode Character Database" section of this file.
 src_libucd_la_SOURCES = \
 	src/case.cpp \
 	src/categories.cpp \
 	src/ctype.cpp
--- a/src/case.cpp
+++ b/src/case.cpp
--- a/src/include/ucd/ucd.h
+++ b/src/include/ucd/ucd.h
@@ -31,6 +31,11 @@ namespace ucd
 	  */
 	typedef uint32_t codepoint_t;

 	/** @name  Unicode General Category
 	  * @brief These functions query the General Category property of Unicode codepoints.
 	  */
 	//@{

 	/** @brief Unicode General Category Groups
 	  * @see   http://www.unicode.org/reports/tr44/
 	  */
@@ -106,6 +111,7 @@ namespace ucd
 	category lookup_category(codepoint_t c);


 	//@}
 	/** @name  ctype-style APIs
 	  * @brief These functions provide wctype compatible functions using the UCD data.
 	  */
@@ -183,6 +189,50 @@ namespace ucd
 	int isupper(codepoint_t c);


 	//@}
 	/** @name  Case Conversion APIs
 	  * @brief These functions convert Unicode codepoints between lower, upper and title case.
 	  */
 	//@{


 	/** @brief Convert the Unicode codepoint to upper-case.
 	  *
 	  * This function only uses the simple (one-to-one) case mapping present
 	  * in the UnicodeData file. The data in SpecialCasing is not used, as it
 	  * requires Unicode codepoints to be mapped to multiple codepoints.
 	  *
 	  * @param c The Unicode codepoint to convert.
 	  * @return  The upper-case Unicode codepoint for this codepoint, or
 	  *          this codepoint if there is no upper-case codepoint.
 	  */
 	codepoint_t toupper(codepoint_t c);

 	/** @brief Convert the Unicode codepoint to lower-case.
 	  *
 	  * This function only uses the simple (one-to-one) case mapping present
 	  * in the UnicodeData file. The data in SpecialCasing is not used, as it
 	  * requires Unicode codepoints to be mapped to multiple codepoints.
 	  *
 	  * @param c The Unicode codepoint to convert.
 	  * @return  The lower-case Unicode codepoint for this codepoint, or
 	  *          this codepoint if there is no lower-case codepoint.
 	  */
 	codepoint_t tolower(codepoint_t c);

 	/** @brief Convert the Unicode codepoint to title-case.
 	  *
 	  * This function only uses the simple (one-to-one) case mapping present
 	  * in the UnicodeData file. The data in SpecialCasing is not used, as it
 	  * requires Unicode codepoints to be mapped to multiple codepoints.
 	  *
 	  * @param c The Unicode codepoint to convert.
 	  * @return  The title-case Unicode codepoint for this codepoint, or
 	  *          this codepoint if there is no title-case codepoint.
 	  */
 	codepoint_t totitle(codepoint_t c);


 	//@}
 }

--- a/tools/case.py
+++ b/tools/case.py
@@ -0,0 +1,100 @@
 #!/usr/bin/python

 # Copyright (C) 2012 Reece H. Dunn
 #
 # This file is part of ucd-tools.
 #
 # ucd-tools is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # ucd-tools is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

 import os
 import sys
 import ucd

 # The UCD directory is the first command-line argument; the text after its
 # last '-' is reported as the UCD version in the generated file.
 ucd_rootdir = sys.argv[1]
 ucd_version = ucd_rootdir.rsplit('-', 1)[-1]

 # Value used by this script to recognise an absent case mapping.
 null = ucd.CodePoint('0000')

 # Maps each codepoint that has at least one case mapping to its
 # (lower, upper, title) mappings as read from UnicodeData.
 unicode_chars = {}
 for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
 	mappings = (data['LowerCase'], data['UpperCase'], data['TitleCase'])
 	if any(mapping != null for mapping in mappings):
 		unicode_chars[data['CodePoint']] = mappings

 if __name__ == '__main__':
 	# Emit the license header, includes and the table entry type of the
 	# generated C++ source on stdout. The only substitution made in this
 	# template is the UCD version.
 	sys.stdout.write("""/* Unicode Case Conversion
 *
 * Copyright (C) 2012 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.
 */

 // NOTE: This file is automatically generated from the UnicodeData.txt file in
 // the Unicode Character database by the ucd-tools/tools/case.py script.

 #include "ucd/ucd.h"

 #include <stddef.h>

 using namespace ucd;

 // Unicode Character Data %s

 struct case_conversion_entry
 {
 	codepoint_t codepoint;
 	codepoint_t uppercase;
 	codepoint_t lowercase;
 	codepoint_t titlecase;
 };
 """ % ucd_version)

 	# Emit the lookup table sorted by codepoint, which the generated binary
 	# search relies on. unicode_chars holds (lower, upper, title) tuples,
 	# while the struct fields are ordered upper, lower, title.
 	sys.stdout.write('\n')
 	sys.stdout.write('static const case_conversion_entry case_conversion_data[] =\n')
 	sys.stdout.write('{\n')
 	for codepoint in sorted(unicode_chars.keys()):
 		lower, upper, title = unicode_chars[codepoint]
 		sys.stdout.write('\t{ 0x%s, 0x%s, 0x%s, 0x%s },\n' % (codepoint, upper, lower, title))
 	sys.stdout.write('};\n')

 	# Emit ucd::toupper, ucd::tolower and ucd::totitle. Each performs a binary
 	# search over the closed index range [begin, end] and falls back to the
 	# input codepoint when it has no table entry.
 	for case in ['upper', 'lower', 'title']:
 		sys.stdout.write('\n')
 		sys.stdout.write('ucd::codepoint_t ucd::to%s(codepoint_t c)\n' % case)
 		sys.stdout.write('{\n')
 		sys.stdout.write('\tint begin = 0;\n')
 		# end is the index of the last element, not the element count;
 		# otherwise searching for a codepoint above the last entry reads one
 		# element past the end of the table.
 		sys.stdout.write('\tint end   = (int)(sizeof(case_conversion_data)/sizeof(case_conversion_data[0])) - 1;\n')
 		sys.stdout.write('\twhile (begin <= end)\n')
 		sys.stdout.write('\t{\n')
 		sys.stdout.write('\t\tint pos = begin + (end - begin) / 2;\n')
 		sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n')
 		sys.stdout.write('\t\tif (c == item->codepoint)\n')
 		# A codepoint is listed when any of its three mappings exists; the
 		# missing ones are written to the table as 0x0000, which must map
 		# back to the input codepoint rather than to U+0000.
 		sys.stdout.write('\t\t\treturn item->%scase == 0 ? c : item->%scase;\n' % (case, case))
 		sys.stdout.write('\t\telse if (c > item->codepoint)\n')
 		sys.stdout.write('\t\t\tbegin = pos + 1;\n')
 		sys.stdout.write('\t\telse\n')
 		sys.stdout.write('\t\t\tend = pos - 1;\n')
 		sys.stdout.write('\t}\n')
 		sys.stdout.write('\treturn c;\n')
 		sys.stdout.write('}\n')
--- a/tools/ucd.py
+++ b/tools/ucd.py
@@ -70,7 +70,7 @@ def codepoint(x):
 	if ' ' in x:
 		return [CodePoint(c) for c in x.split()]
 	if x == '':
 		return None
 		return CodePoint('0000')
 	return CodePoint(x)

 def string(x):