Browse Source

Add toupper, tolower and totitle case-conversion APIs.

master
Reece H. Dunn 12 years ago
parent
commit
9c3a87dbeb
5 changed files with 2341 additions and 1 deletions
  1. 4
    0
      Makefile.am
  2. 2186
    0
      src/case.cpp
  3. 50
    0
      src/include/ucd/ucd.h
  4. 100
    0
      tools/case.py
  5. 1
    1
      tools/ucd.py

+ 4
- 0
Makefile.am View File

@@ -54,6 +54,9 @@ EXTRA_DIST += ChangeLog

############################# Unicode Character Database ######################

src/case.cpp: tools/case.py tools/ucd.py
tools/case.py ${UCD_ROOTDIR} > $@

src/categories.cpp: tools/categories.py tools/ucd.py
tools/categories.py ${UCD_ROOTDIR} > $@

@@ -68,5 +71,6 @@ lib_LTLIBRARIES += src/libucd.la
src_libucd_la_LDFLAGS = -version-info $(LIBUCD_VERSION)
src_libucd_la_CXXFLAGS = ${AM_CXXFLAGS}
src_libucd_la_SOURCES = \
src/case.cpp \
src/categories.cpp \
src/ctype.cpp

+ 2186
- 0
src/case.cpp
File diff suppressed because it is too large
View File


+ 50
- 0
src/include/ucd/ucd.h View File

@@ -31,6 +31,11 @@ namespace ucd
*/
typedef uint32_t codepoint_t;

/** @name Unicode General Category
* @brief These functions query the General Category property of Unicode codepoints.
*/
//@{

/** @brief Unicode General Category Groups
* @see http://www.unicode.org/reports/tr44/
*/
@@ -106,6 +111,7 @@ namespace ucd
category lookup_category(codepoint_t c);


//@}
/** @name ctype-style APIs
* @brief These functions provide wctype compatible functions using the UCD data.
*/
@@ -183,6 +189,50 @@ namespace ucd
int isupper(codepoint_t c);


//@}
/** @name Case Conversion APIs
* @brief These functions convert Unicode codepoints between lower, upper and title case.
*/
//@{


/** @brief Convert the Unicode codepoint to upper-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The upper-case Unicode codepoint for this codepoint, or
* this codepoint if there is no upper-case codepoint.
*/
codepoint_t toupper(codepoint_t c);

/** @brief Convert the Unicode codepoint to lower-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The lower-case Unicode codepoint for this codepoint, or
* this codepoint if there is no lower-case codepoint.
*/
codepoint_t tolower(codepoint_t c);

/** @brief Convert the Unicode codepoint to title-case.
*
* This function only uses the simple case mapping present in the
* UnicodeData file. The data in SpecialCasing requires Unicode
* codepoints to be mapped to multiple codepoints.
*
* @param c The Unicode codepoint to convert.
* @return The title-case Unicode codepoint for this codepoint, or
* this codepoint if there is no title-case codepoint.
*/
codepoint_t totitle(codepoint_t c);


//@}
}


+ 100
- 0
tools/case.py View File

@@ -0,0 +1,100 @@
#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# The UCD data directory is passed as the first command-line argument; the
# text after its last '-' is used as the Unicode version in the output.
ucd_rootdir = sys.argv[1]
ucd_version = ucd_rootdir.rsplit('-', 1)[-1]

# Placeholder codepoint compared against to detect "no case mapping".
null = ucd.CodePoint('0000')

# Maps each codepoint with at least one non-null case mapping to its
# (lower, upper, title) mappings, in that order.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    mappings = (data['LowerCase'], data['UpperCase'], data['TitleCase'])
    if any(mapping != null for mapping in mappings):
        unicode_chars[data['CodePoint']] = mappings

if __name__ == '__main__':
    # Write the generated C++ source to stdout: the licence banner and table
    # entry type, then the case mapping table, then the three lookup
    # functions (ucd::toupper, ucd::tolower, ucd::totitle).
    sys.stdout.write("""/* Unicode Case Conversion
*
* Copyright (C) 2012 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/

// NOTE: This file is automatically generated from the UnicodeData.txt file in
// the Unicode Character database by the ucd-tools/tools/case.py script.

#include "ucd/ucd.h"

#include <stddef.h>

using namespace ucd;

// Unicode Character Data %s

struct case_conversion_entry
{
codepoint_t codepoint;
codepoint_t uppercase;
codepoint_t lowercase;
codepoint_t titlecase;
};
""" % ucd_version)

    # The table is written in sorted key order because the generated lookup
    # functions binary-search it by codepoint.
    sys.stdout.write('\n')
    sys.stdout.write('static const case_conversion_entry case_conversion_data[] =\n')
    sys.stdout.write('{\n')
    for codepoint in sorted(unicode_chars.keys()):
        # unicode_chars stores (lower, upper, title); the struct field order
        # is (codepoint, uppercase, lowercase, titlecase).
        lower, upper, title = unicode_chars[codepoint]
        sys.stdout.write('\t{ 0x%s, 0x%s, 0x%s, 0x%s },\n' % (codepoint, upper, lower, title))
    sys.stdout.write('};\n')

    for case in ['upper', 'lower', 'title']:
        sys.stdout.write('\n')
        sys.stdout.write('ucd::codepoint_t ucd::to%s(codepoint_t c)\n' % case)
        sys.stdout.write('{\n')
        sys.stdout.write('\tint begin = 0;\n')
        # 'end' is an inclusive index, so it must start at the last element;
        # starting at the element count lets 'pos' index one past the table.
        sys.stdout.write('\tint end = sizeof(case_conversion_data)/sizeof(case_conversion_data[0]) - 1;\n')
        sys.stdout.write('\twhile (begin <= end)\n')
        sys.stdout.write('\t{\n')
        sys.stdout.write('\t\tint pos = (begin + end) / 2;\n')
        sys.stdout.write('\t\tconst case_conversion_entry *item = (case_conversion_data + pos);\n')
        sys.stdout.write('\t\tif (c == item->codepoint)\n')
        # A stored value of 0 means the codepoint has no mapping for this
        # case (only its other mappings are set), so return it unchanged.
        sys.stdout.write('\t\t\treturn item->%scase == 0 ? c : item->%scase;\n' % (case, case))
        sys.stdout.write('\t\telse if (c > item->codepoint)\n')
        sys.stdout.write('\t\t\tbegin = pos + 1;\n')
        sys.stdout.write('\t\telse\n')
        sys.stdout.write('\t\t\tend = pos - 1;\n')
        sys.stdout.write('\t}\n')
        sys.stdout.write('\treturn c;\n')
        sys.stdout.write('}\n')

+ 1
- 1
tools/ucd.py View File

@@ -70,7 +70,7 @@ def codepoint(x):
if ' ' in x:
return [CodePoint(c) for c in x.split()]
if x == '':
return None
return CodePoint('0000')
return CodePoint(x)

def string(x):

Loading…
Cancel
Save