123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- #!/usr/bin/python
-
- # Copyright (C) 2012-2016 Reece H. Dunn
- #
- # This file is part of ucd-tools.
- #
- # ucd-tools is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # ucd-tools is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
-
- import os
- import sys
- import ucd
-
# Command-line arguments: argv[1] is handed to ucd.parse_ucd_data as the
# directory holding the 'UnicodeData' data set; argv[2] is the version string
# substituted into the banner comment of the generated C file.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Data sets to read, in order, as (directory, data set name) pairs. The
# optional '--with-csur' flag appends the data sets found under 'data/csur'.
data_sources = [(ucd_rootdir, 'UnicodeData')]
if '--with-csur' in sys.argv:
    data_sources += [('data/csur', csur_name) for csur_name in ['Klingon']]

# Maps every codepoint listed in the data sets to its 'GeneralCategory' value.
# A later data set overwrites the entry an earlier one made for the same
# codepoint.
unicode_chars = {}
for rootdir, dataset in data_sources:
    for record in ucd.parse_ucd_data(rootdir, dataset):
        for codepoint in record['CodePoint']:
            unicode_chars[codepoint] = record['GeneralCategory']
-
# This map combines the information in the UnicodeData and Blocks data files
# so that fewer character tables need to be generated.
#
# Each entry is (codepoint range, category, comment). When the category is
# None the range is looked up through per-page tables; otherwise every
# codepoint in the range is reported as that single category.
category_sets = [
    (ucd.CodeRange(span), category, comment)
    for span, category, comment in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00DFFF', 'Cs', 'Surrogates'),
        ('00E000..00F7FF', 'Co', 'Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Cn', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..0EFFFF', 'Cn', 'Unassigned'),
        ('0F0000..0FFFFD', 'Co', 'Plane 15 Private Use'),
        ('0FFFFE..0FFFFF', 'Cn', 'Plane 15 Private Use'),
        ('100000..10FFFD', 'Co', 'Plane 16 Private Use'),
        ('10FFFE..10FFFF', 'Cn', 'Plane 16 Private Use'),
    )
]
-
# Many 256-codepoint pages consist of just one of these categories, e.g.:
#     Cn -- Unassigned
#     Lo -- CJK Ideographs
# Such a page is recorded by its category name instead of a list of entries.
special_categories = ['Cn', 'Co', 'Lo', 'Sm', 'So']

# category_tables maps 'FIRST_LAST' -- one key per category_sets range that has
# no fixed category -- to a dict keyed by the first codepoint of each page.
# The value for a page is either a category name from special_categories (the
# whole page has that category) or the list of per-codepoint categories.
category_tables = {}
for codepoints, fixed_category, _comment in category_sets:
    if fixed_category:
        continue

    # Pass 1: cut the range into pages of up to 256 categories each,
    # remembering the codepoint every page starts at. Codepoints missing
    # from unicode_chars are treated as 'Cn' (Unassigned).
    pages = []
    for offset, codepoint in enumerate(codepoints):
        if offset % 256 == 0:
            pages.append((codepoint, []))
        pages[-1][1].append(unicode_chars.get(codepoint, 'Cn'))

    # Pass 2: decide how each page is stored.
    table = {}
    final_index = len(pages) - 1
    for index, (first_codepoint, entries) in enumerate(pages):
        # uniform is the page's single category, or None when it is mixed.
        mixed = any(entry != entries[0] for entry in entries)
        uniform = None if mixed else entries[0]

        if uniform in special_categories:
            table[first_codepoint] = uniform
        elif uniform and index != final_index:
            # A single-category page whose category is not listed in
            # special_categories; the last page of a range is exempt from
            # this check and is stored as an ordinary list.
            raise Exception('%s only table not in the special_categories list.' % uniform)
        else:
            table[first_codepoint] = entries

    category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
-
if __name__ == '__main__':
    # Everything below writes the generated C source file to standard output.
    emit = sys.stdout.write

    def emit_page_table(name, entries):
        """Write one `categories_<name>` array, 16 entries per output row."""
        emit('\n')
        emit('static const uint8_t categories_%s[256] =\n' % name)
        emit('{')
        for offset, entry in enumerate(entries):
            if offset % 16 == 0:
                # Start a new row, labelled with the offset of its first entry.
                emit('\n\t/* %02X */' % offset)
            emit(' %s,' % entry)
        emit('\n};\n')

    # Ranges that are looked up through page tables, paired with their
    # category_tables key.
    table_ranges = [
        (codepoints, '%s_%s' % (codepoints.first, codepoints.last))
        for codepoints, category, comment in category_sets
        if not category
    ]

    # File banner, includes, and short aliases for the category enumerators.
    emit("""/* Unicode General Categories
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
 */

/* NOTE: This file is automatically generated from the UnicodeData.txt file in
 * the Unicode Character database by the ucd-tools/tools/categories.py script.
 */

#include "ucd/ucd.h"

#include <stddef.h>

#define Cc UCD_CATEGORY_Cc
#define Cf UCD_CATEGORY_Cf
#define Cn UCD_CATEGORY_Cn
#define Co UCD_CATEGORY_Co
#define Cs UCD_CATEGORY_Cs
#define Ll UCD_CATEGORY_Ll
#define Lm UCD_CATEGORY_Lm
#define Lo UCD_CATEGORY_Lo
#define Lt UCD_CATEGORY_Lt
#define Lu UCD_CATEGORY_Lu
#define Mc UCD_CATEGORY_Mc
#define Me UCD_CATEGORY_Me
#define Mn UCD_CATEGORY_Mn
#define Nd UCD_CATEGORY_Nd
#define Nl UCD_CATEGORY_Nl
#define No UCD_CATEGORY_No
#define Pc UCD_CATEGORY_Pc
#define Pd UCD_CATEGORY_Pd
#define Pe UCD_CATEGORY_Pe
#define Pf UCD_CATEGORY_Pf
#define Pi UCD_CATEGORY_Pi
#define Po UCD_CATEGORY_Po
#define Ps UCD_CATEGORY_Ps
#define Sc UCD_CATEGORY_Sc
#define Sk UCD_CATEGORY_Sk
#define Sm UCD_CATEGORY_Sm
#define So UCD_CATEGORY_So
#define Zl UCD_CATEGORY_Zl
#define Zp UCD_CATEGORY_Zp
#define Zs UCD_CATEGORY_Zs
#define Ii UCD_CATEGORY_Ii

/* Unicode Character Data %s */
""" % ucd_version)

    # One shared table per special category, holding that category 256 times.
    for special in special_categories:
        emit_page_table(special, [special] * 256)

    # One table per page that was stored as a list of categories; pages
    # stored as a special category name reuse the shared tables above.
    for codepoints, table_key in table_ranges:
        pages = category_tables[table_key]
        for page_start in sorted(pages):
            page = pages[page_start]
            if page in special_categories:
                continue
            emit_page_table(page_start, page)

    # Per-range index arrays pointing at each page's table, in page order.
    for codepoints, table_key in table_ranges:
        emit('\n')
        emit('static const uint8_t *categories_%s[] =\n' % table_key)
        emit('{\n')
        for page_start, page in sorted(category_tables[table_key].items()):
            if isinstance(page, str):
                # Shared table; the comment records which page it stands for.
                emit('\tcategories_%s, /* %s */\n' % (page, page_start))
            else:
                emit('\tcategories_%s,\n' % page_start)
        emit('};\n')

    # ucd_lookup_category: a cascade of upper-bound tests, one per entry of
    # category_sets. Only `c <= last` is tested, so the cascade relies on
    # category_sets being listed in ascending order.
    emit('\n')
    emit('ucd_category ucd_lookup_category(codepoint_t c)\n')
    emit('{\n')
    for codepoints, category, comment in category_sets:
        if category:
            emit('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, category, codepoints, comment))
            continue
        emit('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
        emit('\t{\n')
        emit('\t\tconst uint8_t *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
        emit('\t\treturn (ucd_category)table[c % 256];\n')
        emit('\t}\n')
    emit('\treturn Ii; /* Invalid Unicode Codepoint */\n')
    emit('}\n')

    # Fixed trailer: category -> category group mapping and the combined
    # codepoint -> category group lookup. Indentation of the emitted C is
    # spelled with \t escapes, like the writes above.
    emit("""
ucd_category_group ucd_get_category_group_for_category(ucd_category c)
{
\tswitch (c)
\t{
\tcase Cc: case Cf: case Cn: case Co: case Cs:
\t\treturn UCD_CATEGORY_GROUP_C;
\tcase Ll: case Lm: case Lo: case Lt: case Lu:
\t\treturn UCD_CATEGORY_GROUP_L;
\tcase Mc: case Me: case Mn:
\t\treturn UCD_CATEGORY_GROUP_M;
\tcase Nd: case Nl: case No:
\t\treturn UCD_CATEGORY_GROUP_N;
\tcase Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
\t\treturn UCD_CATEGORY_GROUP_P;
\tcase Sc: case Sk: case Sm: case So:
\t\treturn UCD_CATEGORY_GROUP_S;
\tcase Zl: case Zp: case Zs:
\t\treturn UCD_CATEGORY_GROUP_Z;
\tcase Ii:
\tdefault:
\t\treturn UCD_CATEGORY_GROUP_I;
\t}
}

ucd_category_group ucd_lookup_category_group(codepoint_t c)
{
\treturn (ucd_category_group)ucd_get_category_group_for_category(ucd_lookup_category(c));
}
""")
|