123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- #!/usr/bin/python
-
- # Copyright (C) 2012-2016 Reece H. Dunn
- #
- # This file is part of ucd-tools.
- #
- # ucd-tools is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # ucd-tools is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
-
- import os
- import sys
- import ucd
-
# Positional command-line arguments: the directory holding the Unicode
# Character Database files, then the UCD version string that is stamped into
# the generated output.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Map every codepoint listed in the 'Scripts' data file to its script code.
unicode_chars = {}
for record in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    unicode_chars.update(
        (codepoint, record['Script']) for codepoint in record['Range'])

# With --with-csur, also load the data sets under data/csur. These are applied
# after the UCD data, so they override any script already recorded for a
# codepoint.
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        for record in ucd.parse_ucd_data('data/csur', csur):
            unicode_chars.update(
                (codepoint, record['Script'])
                for codepoint in record['CodePoint'])
-
# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (code range, script, comment). A range with a script is
# answered with that script directly by the generated lookup function; a
# range whose script is None gets per-page lookup tables instead.
script_sets = [
    (ucd.CodeRange(bounds), script, comment)
    for bounds, script, comment in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00F7FF', 'Zzzz', 'Surrogates / Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Zzzz', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..10FFFF', 'Zzzz', 'Unassigned'),
    )
]
-
# These scripts have many pages consisting of just this script. The list is
# filled in while the pages are scanned below: a script is added the first
# time a whole 256-codepoint page is found to contain nothing else, and every
# such page then shares one table named after the script.
special_scripts = []


def _store_page(table, first_codepoint, page_script, page_entries):
    """Record one finished 256-codepoint page in *table*.

    table           -- dict for the current range, keyed by the first
                       codepoint of each page.
    first_codepoint -- the first codepoint of the page being stored.
    page_script     -- the script shared by every codepoint on the page, or
                       None when the page mixes several scripts.
    page_entries    -- the per-codepoint script list for the page.

    A uniform page is stored as the script name (a string) and the script is
    registered in special_scripts if it was not already there; a mixed page
    is stored as its full list of scripts.
    """
    if page_script:
        if page_script not in special_scripts:
            special_scripts.append(page_script)
        table[first_codepoint] = page_script
    else:
        table[first_codepoint] = page_entries


# For every range that has no single fixed script, split the range into pages
# of 256 codepoints and record each page. The result is keyed by
# '<first>_<last>' of the range.
script_tables = {}
for codepoints, script, comment in script_sets:
    if script:
        continue  # the whole range maps to one script; no tables needed

    table = {}
    page_entries = None
    page_start = None
    page_script = None
    for i, codepoint in enumerate(codepoints):
        # Codepoints absent from the loaded data are reported as Unknown.
        current = unicode_chars.get(codepoint, 'Zzzz')
        if i % 256 == 0:
            # Page boundary: flush the previous page, then start a new one.
            if page_entries:
                _store_page(table, page_start, page_script, page_entries)
            page_entries = []
            page_start = codepoint
            page_script = current
        if current != page_script:
            # A second script on this page: it can no longer be shared. Once
            # cleared to None this stays None for the rest of the page.
            page_script = None
        page_entries.append(current)

    # Flush the trailing page through the same path as all the others, so a
    # uniform final page is shared as well instead of getting its own table.
    if page_entries:
        _store_page(table, page_start, page_script, page_entries)
    script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
-
if __name__ == '__main__':
    # Write the generated C source to stdout: the file header, one array per
    # distinct 256-codepoint page, one page-index array per table-driven
    # range, and finally the ucd_lookup_script() function.
    emit = sys.stdout.write

    def emit_page(name, entries):
        """Write the C array scripts_<name>, 16 entries per output line."""
        emit('\n')
        emit('static const uint8_t scripts_%s[256] =\n' % name)
        emit('{')
        for offset, entry in enumerate(entries):
            if offset % 16 == 0:
                emit('\n\t/* %02X */' % offset)
            emit(' %s,' % entry)
        emit('\n};\n')

    emit("""/* Unicode Scripts
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ucd-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
 */

/* NOTE: This file is automatically generated from the Scripts.txt file in
 * the Unicode Character database by the ucd-tools/tools/scripts.py script.
 */

#include "ucd/ucd.h"

#include <stddef.h>

#define Adlm UCD_SCRIPT_Adlm
#define Afak UCD_SCRIPT_Afak
#define Aghb UCD_SCRIPT_Aghb
#define Ahom UCD_SCRIPT_Ahom
#define Arab UCD_SCRIPT_Arab
#define Armi UCD_SCRIPT_Armi
#define Armn UCD_SCRIPT_Armn
#define Avst UCD_SCRIPT_Avst
#define Bali UCD_SCRIPT_Bali
#define Bamu UCD_SCRIPT_Bamu
#define Bass UCD_SCRIPT_Bass
#define Batk UCD_SCRIPT_Batk
#define Beng UCD_SCRIPT_Beng
#define Bhks UCD_SCRIPT_Bhks
#define Blis UCD_SCRIPT_Blis
#define Bopo UCD_SCRIPT_Bopo
#define Brah UCD_SCRIPT_Brah
#define Brai UCD_SCRIPT_Brai
#define Bugi UCD_SCRIPT_Bugi
#define Buhd UCD_SCRIPT_Buhd
#define Cakm UCD_SCRIPT_Cakm
#define Cans UCD_SCRIPT_Cans
#define Cari UCD_SCRIPT_Cari
#define Cham UCD_SCRIPT_Cham
#define Cher UCD_SCRIPT_Cher
#define Cirt UCD_SCRIPT_Cirt
#define Copt UCD_SCRIPT_Copt
#define Cprt UCD_SCRIPT_Cprt
#define Cyrl UCD_SCRIPT_Cyrl
#define Cyrs UCD_SCRIPT_Cyrs
#define Deva UCD_SCRIPT_Deva
#define Dsrt UCD_SCRIPT_Dsrt
#define Dupl UCD_SCRIPT_Dupl
#define Egyd UCD_SCRIPT_Egyd
#define Egyh UCD_SCRIPT_Egyh
#define Egyp UCD_SCRIPT_Egyp
#define Elba UCD_SCRIPT_Elba
#define Ethi UCD_SCRIPT_Ethi
#define Geok UCD_SCRIPT_Geok
#define Geor UCD_SCRIPT_Geor
#define Glag UCD_SCRIPT_Glag
#define Goth UCD_SCRIPT_Goth
#define Gran UCD_SCRIPT_Gran
#define Grek UCD_SCRIPT_Grek
#define Gujr UCD_SCRIPT_Gujr
#define Guru UCD_SCRIPT_Guru
#define Hang UCD_SCRIPT_Hang
#define Hani UCD_SCRIPT_Hani
#define Hano UCD_SCRIPT_Hano
#define Hans UCD_SCRIPT_Hans
#define Hant UCD_SCRIPT_Hant
#define Hatr UCD_SCRIPT_Hatr
#define Hebr UCD_SCRIPT_Hebr
#define Hira UCD_SCRIPT_Hira
#define Hluw UCD_SCRIPT_Hluw
#define Hmng UCD_SCRIPT_Hmng
#define Hrkt UCD_SCRIPT_Hrkt
#define Hung UCD_SCRIPT_Hung
#define Inds UCD_SCRIPT_Inds
#define Ital UCD_SCRIPT_Ital
#define Java UCD_SCRIPT_Java
#define Jpan UCD_SCRIPT_Jpan
#define Jurc UCD_SCRIPT_Jurc
#define Kali UCD_SCRIPT_Kali
#define Kana UCD_SCRIPT_Kana
#define Khar UCD_SCRIPT_Khar
#define Khmr UCD_SCRIPT_Khmr
#define Khoj UCD_SCRIPT_Khoj
#define Knda UCD_SCRIPT_Knda
#define Kore UCD_SCRIPT_Kore
#define Kpel UCD_SCRIPT_Kpel
#define Kthi UCD_SCRIPT_Kthi
#define Lana UCD_SCRIPT_Lana
#define Laoo UCD_SCRIPT_Laoo
#define Latf UCD_SCRIPT_Latf
#define Latg UCD_SCRIPT_Latg
#define Latn UCD_SCRIPT_Latn
#define Lepc UCD_SCRIPT_Lepc
#define Limb UCD_SCRIPT_Limb
#define Lina UCD_SCRIPT_Lina
#define Linb UCD_SCRIPT_Linb
#define Lisu UCD_SCRIPT_Lisu
#define Loma UCD_SCRIPT_Loma
#define Lyci UCD_SCRIPT_Lyci
#define Lydi UCD_SCRIPT_Lydi
#define Mahj UCD_SCRIPT_Mahj
#define Mand UCD_SCRIPT_Mand
#define Mani UCD_SCRIPT_Mani
#define Marc UCD_SCRIPT_Marc
#define Maya UCD_SCRIPT_Maya
#define Mend UCD_SCRIPT_Mend
#define Merc UCD_SCRIPT_Merc
#define Mero UCD_SCRIPT_Mero
#define Mlym UCD_SCRIPT_Mlym
#define Modi UCD_SCRIPT_Modi
#define Mong UCD_SCRIPT_Mong
#define Moon UCD_SCRIPT_Moon
#define Mroo UCD_SCRIPT_Mroo
#define Mtei UCD_SCRIPT_Mtei
#define Mult UCD_SCRIPT_Mult
#define Mymr UCD_SCRIPT_Mymr
#define Narb UCD_SCRIPT_Narb
#define Nbat UCD_SCRIPT_Nbat
#define Newa UCD_SCRIPT_Newa
#define Nkgb UCD_SCRIPT_Nkgb
#define Nkoo UCD_SCRIPT_Nkoo
#define Nshu UCD_SCRIPT_Nshu
#define Ogam UCD_SCRIPT_Ogam
#define Olck UCD_SCRIPT_Olck
#define Orkh UCD_SCRIPT_Orkh
#define Orya UCD_SCRIPT_Orya
#define Osge UCD_SCRIPT_Osge
#define Osma UCD_SCRIPT_Osma
#define Palm UCD_SCRIPT_Palm
#define Pauc UCD_SCRIPT_Pauc
#define Perm UCD_SCRIPT_Perm
#define Phag UCD_SCRIPT_Phag
#define Phli UCD_SCRIPT_Phli
#define Phlp UCD_SCRIPT_Phlp
#define Phlv UCD_SCRIPT_Phlv
#define Phnx UCD_SCRIPT_Phnx
#define Plrd UCD_SCRIPT_Plrd
#define Prti UCD_SCRIPT_Prti
#define Qaak UCD_SCRIPT_Qaak
#define Rjng UCD_SCRIPT_Rjng
#define Roro UCD_SCRIPT_Roro
#define Runr UCD_SCRIPT_Runr
#define Samr UCD_SCRIPT_Samr
#define Sara UCD_SCRIPT_Sara
#define Sarb UCD_SCRIPT_Sarb
#define Saur UCD_SCRIPT_Saur
#define Sgnw UCD_SCRIPT_Sgnw
#define Shaw UCD_SCRIPT_Shaw
#define Shrd UCD_SCRIPT_Shrd
#define Sidd UCD_SCRIPT_Sidd
#define Sind UCD_SCRIPT_Sind
#define Sinh UCD_SCRIPT_Sinh
#define Sora UCD_SCRIPT_Sora
#define Sund UCD_SCRIPT_Sund
#define Sylo UCD_SCRIPT_Sylo
#define Syrc UCD_SCRIPT_Syrc
#define Syre UCD_SCRIPT_Syre
#define Syrj UCD_SCRIPT_Syrj
#define Syrn UCD_SCRIPT_Syrn
#define Tagb UCD_SCRIPT_Tagb
#define Takr UCD_SCRIPT_Takr
#define Tale UCD_SCRIPT_Tale
#define Talu UCD_SCRIPT_Talu
#define Taml UCD_SCRIPT_Taml
#define Tang UCD_SCRIPT_Tang
#define Tavt UCD_SCRIPT_Tavt
#define Telu UCD_SCRIPT_Telu
#define Teng UCD_SCRIPT_Teng
#define Tfng UCD_SCRIPT_Tfng
#define Tglg UCD_SCRIPT_Tglg
#define Thaa UCD_SCRIPT_Thaa
#define Thai UCD_SCRIPT_Thai
#define Tibt UCD_SCRIPT_Tibt
#define Tirh UCD_SCRIPT_Tirh
#define Ugar UCD_SCRIPT_Ugar
#define Vaii UCD_SCRIPT_Vaii
#define Visp UCD_SCRIPT_Visp
#define Wara UCD_SCRIPT_Wara
#define Wole UCD_SCRIPT_Wole
#define Xpeo UCD_SCRIPT_Xpeo
#define Xsux UCD_SCRIPT_Xsux
#define Yiii UCD_SCRIPT_Yiii
#define Zinh UCD_SCRIPT_Zinh
#define Zmth UCD_SCRIPT_Zmth
#define Zsym UCD_SCRIPT_Zsym
#define Zxxx UCD_SCRIPT_Zxxx
#define Zyyy UCD_SCRIPT_Zyyy
#define Zzzz UCD_SCRIPT_Zzzz

/* Unicode Character Data %s */
""" % ucd_version)

    # Shared pages: one array per special script, all 256 entries identical.
    for shared in special_scripts:
        emit_page(shared, [shared] * 256)

    # Mixed pages: one array per page, named after the page's first
    # codepoint. Pages stored as a special script name reuse the shared
    # arrays written above, so they are skipped here.
    for codepoints, fixed_script, comment in script_sets:
        if fixed_script:
            continue
        pages = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
        for first in sorted(pages):
            page = pages[first]
            if page in special_scripts:
                continue
            emit_page(first, page)

    # Page index: for each table-driven range, an array of pointers to the
    # page arrays in codepoint order.
    for codepoints, fixed_script, comment in script_sets:
        if fixed_script:
            continue
        range_name = '%s_%s' % (codepoints.first, codepoints.last)
        emit('\n')
        emit('static const uint8_t *scripts_%s[] =\n' % range_name)
        emit('{\n')
        for first, page in sorted(script_tables[range_name].items()):
            if isinstance(page, str):
                # Shared page: point at the script's array and note which
                # page this slot stands for.
                emit('\tscripts_%s, /* %s */\n' % (page, first))
            else:
                emit('\tscripts_%s,\n' % first)
        emit('};\n')

    # The lookup function tests the ranges in ascending order, so each test
    # only needs to compare against the upper bound of its range.
    emit('\n')
    emit('ucd_script ucd_lookup_script(codepoint_t c)\n')
    emit('{\n')
    for codepoints, fixed_script, comment in script_sets:
        if fixed_script:
            emit('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, fixed_script, codepoints, comment))
            continue
        emit('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
        emit('\t{\n')
        emit('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
        emit('\t\treturn (ucd_script)table[c % 256];\n')
        emit('\t}\n')
    emit('\treturn Zzzz; /* Invalid Unicode Codepoint */\n')
    emit('}\n')
|