#!/usr/bin/python

# Copyright (C) 2012-2017 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

"""Print one line of character data for every code point U+0000..U+10FFFF.

Usage: the first command-line argument is the directory handed to
``ucd.parse_ucd_data`` for the UnicodeData, PropList, DerivedCoreProperties
and Scripts files.  Passing ``--with-csur`` additionally loads the Klingon
data from ``data/csur``.

Each output line holds, space separated: the code point, script, major
general category, general category, upper/lower/title case mappings, the
twelve ctype-style flags (digit, xdigit, cntrl, space, blank, punct, print,
graph, alnum, alpha, upper, lower) and the property bit mask as 16 hex
digits.
"""

import os
import sys
import ucd

# Directory passed to ucd.parse_ucd_data for the core UCD files.
ucd_rootdir = sys.argv[1]
emoji_rootdir = 'data/emoji'
csur_rootdir = 'data/csur'

# Value compared against case mappings to detect "no mapping".
null = ucd.CodePoint('0000')

# (directory, file) pairs whose entries each set one binary property on a
# range of code points.  Kept under its own name because ``properties`` is
# the bit-mask function defined further down.
property_files = [
    (ucd_rootdir, 'PropList'),
    (ucd_rootdir, 'DerivedCoreProperties'),
    (emoji_rootdir, 'emoji-data'),
]

# Maps a code point to the dict of fields collected for it.
unicode_chars = {}

for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    for codepoint in data['CodePoint']:
        unicode_chars[codepoint] = data

for propdir, propfile in property_files:
    for data in ucd.parse_ucd_data(propdir, propfile):
        for codepoint in data['Range']:
            # Code points absent from UnicodeData get a minimal record so
            # the property can still be attached to them.
            entry = unicode_chars.setdefault(codepoint, {'CodePoint': codepoint})
            entry[data['Property']] = 1

for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        unicode_chars[codepoint]['Script'] = data['Script']

if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        for data in ucd.parse_ucd_data(csur_rootdir, csur):
            for codepoint in data['CodePoint']:
                # CSUR records replace whatever was loaded for the code point.
                unicode_chars[codepoint] = data


def _major_category(data):
    """Return the first letter of the general category ('C' when missing)."""
    return data.get('GeneralCategory', 'Cn')[0]


def _is_not_no_break(data):
    """Return 1 unless the decomposition type starts with '<noBreak>'."""
    dt = data.get('DecompositionType', '')
    return 1 if dt is None or not dt.startswith('<noBreak>') else 0


def iscntrl(data):
    """Return 1 if the record has no (or an empty) 'Name' field, else 0."""
    return 1 if data.get('Name', '') == '' else 0


def isdigit(data):
    """Return 1 if the character is one of the ASCII digits 0-9, else 0."""
    return 1 if data['CodePoint'].char() in '0123456789' else 0


def isxdigit(data):
    """Return 1 if the character is an ASCII hexadecimal digit, else 0."""
    return 1 if data['CodePoint'].char() in '0123456789ABCDEFabcdef' else 0


def isspace(data):
    """Return 1 for White_Space characters that are not no-break spaces.

    A character whose decomposition type starts with ``<noBreak>`` is
    excluded even though it carries the White_Space property.
    """
    if not data.get('White_Space', 0):
        return 0
    return _is_not_no_break(data)


def isblank(data):
    """Return 1 for word separators: Zs characters and TAB, minus no-break.

    A character whose decomposition type starts with ``<noBreak>`` is
    excluded.
    """
    # word separator
    is_separator = (data.get('GeneralCategory', 'Cn') == 'Zs'
                    or data['CodePoint'].char() == '\t')
    if not is_separator:
        return 0
    return _is_not_no_break(data)


def ispunct(data):
    """Return 1 if the character is graphical but not alphanumeric, else 0."""
    return 1 if isgraph(data) and not isalnum(data) else 0


def isprint(data):
    """Return 1 if the major general category is L, M, N, P, S or Z."""
    # not in 'CI'
    return 1 if _major_category(data) in 'LMNPSZ' else 0


def isgraph(data):
    """Return 1 if the major general category is L, M, N, P or S."""
    # not in 'CZI'
    return 1 if _major_category(data) in 'LMNPS' else 0


def isalnum(data):
    """Return 1 for category N characters, else the Alphabetic flag."""
    if _major_category(data) in 'N':
        return 1
    return data.get('Alphabetic', 0)


def isalpha(data):
    """Return the Alphabetic flag of the record (0 when absent)."""
    return data.get('Alphabetic', 0)


def isupper(data):
    """Return 1 if Uppercase is set or a non-null lowercase mapping exists."""
    if data.get('Uppercase', 0):
        return 1
    # Some Lt characters have lowercase forms.
    if data.get('LowerCase', null) != null:
        return 1
    return 0


def islower(data):
    """Return 1 if Lowercase is set or a non-null uppercase mapping exists."""
    if data.get('Lowercase', 0):
        return 1
    if data.get('UpperCase', null) != null:
        return 1
    return 0


def decomposition_type(data, dtype):
    """Return the decomposition type if it starts with ``dtype``, else None."""
    value = data.get('DecompositionType', None)
    if value and value.startswith(dtype):
        return value
    return None


# (bit position, property name) pairs making up the property bit mask.
# The bit numbers are part of the output format; never renumber them.
_PROPERTY_BITS = (
    (0, 'White_Space'),
    (1, 'Bidi_Control'),
    (2, 'Join_Control'),
    (3, 'Dash'),
    (4, 'Hyphen'),
    (5, 'Quotation_Mark'),
    (6, 'Terminal_Punctuation'),
    (7, 'Other_Math'),
    (8, 'Hex_Digit'),
    (9, 'ASCII_Hex_Digit'),
    (10, 'Other_Alphabetic'),
    (11, 'Ideographic'),
    (12, 'Diacritic'),
    (13, 'Extender'),
    (14, 'Other_Lowercase'),
    (15, 'Other_Uppercase'),
    (16, 'Noncharacter_Code_Point'),
    (17, 'Other_Grapheme_Extend'),
    (18, 'IDS_Binary_Operator'),
    (19, 'IDS_Trinary_Operator'),
    (20, 'Radical'),
    (21, 'Unified_Ideograph'),
    (22, 'Other_Default_Ignorable_Code_Point'),
    (23, 'Deprecated'),
    (24, 'Soft_Dotted'),
    (25, 'Logical_Order_Exception'),
    (26, 'Other_ID_Start'),
    (27, 'Other_ID_Continue'),
    (28, 'Sentence_Terminal'),
    (29, 'Variation_Selector'),
    (30, 'Pattern_White_Space'),
    (31, 'Pattern_Syntax'),
    (32, 'Prepended_Concatenation_Mark'),
    (33, 'Emoji'),  # emoji-data
    (34, 'Emoji_Presentation'),  # emoji-data
    (35, 'Emoji_Modifier'),  # emoji-data
    (36, 'Emoji_Modifier_Base'),  # emoji-data
    (37, 'Regional_Indicator'),  # PropList 10.0.0
    (38, 'Emoji_Component'),  # emoji-data 5.0
)


def properties(data):
    """Return the binary properties of the record packed into an integer.

    Bit ``n`` of the result is the record's value for the property listed
    at position ``n`` in ``_PROPERTY_BITS`` (0 when the property is absent).
    """
    props = 0
    for bit, name in _PROPERTY_BITS:
        props += (2 ** bit) * data.get(name, 0)
    return props


if __name__ == '__main__':
    for codepoint in ucd.CodeRange('000000..10FFFF'):
        try:
            data = unicode_chars[codepoint]
        except KeyError:
            # Nothing was loaded for this code point; use a minimal record.
            data = {'CodePoint': codepoint}

        script = data.get('Script', 'Zzzz')
        title = data.get('TitleCase', codepoint)
        upper = data.get('UpperCase', codepoint)
        lower = data.get('LowerCase', codepoint)

        # A null mapping means "no mapping": the character maps to itself.
        if title == null:
            title = codepoint
        if upper == null:
            upper = codepoint
        if lower == null:
            lower = codepoint

        category = data.get('GeneralCategory', 'Cn')
        print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %016x' % (
            codepoint, script,
            category[0], category,
            upper, lower, title,
            isdigit(data), isxdigit(data),
            iscntrl(data), isspace(data), isblank(data), ispunct(data),
            isprint(data), isgraph(data), isalnum(data), isalpha(data),
            isupper(data), islower(data),
            properties(data)))