#!/usr/bin/python
# Copyright (C) 2012-2016 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import ucd
# Positional command-line arguments: the directory holding the UCD data files
# and the version string that is echoed into the generated output.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps every codepoint listed in the 'Scripts' data file to its script code.
unicode_chars = {}
for record in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for cp in record['Range']:
        unicode_chars[cp] = record['Script']

# The extra data sets under data/csur are only merged in when --with-csur is
# given anywhere on the command line; their entries override the ones above.
csur_sets = ('Klingon',) if '--with-csur' in sys.argv else ()
for csur in csur_sets:
    for record in ucd.parse_ucd_data('data/csur', csur):
        for cp in record['CodePoint']:
            unicode_chars[cp] = record['Script']
# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is (code range, script, comment). A script of None means the
# range is looked up per codepoint through generated tables; any other value
# is returned directly for every codepoint in the range.
script_sets = [
    (ucd.CodeRange(span), fixed_script, note)
    for span, fixed_script, note in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00F7FF', 'Zzzz', 'Surrogates / Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Zzzz', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..10FFFF', 'Zzzz', 'Unassigned'),
    )
]
# Scripts that fill at least one whole 256-codepoint page. Every such page
# shares a single generated table named after the script instead of getting
# a table of its own.
special_scripts = []

# Maps '<first>_<last>' of each table-driven range in script_sets to a dict
# of {first codepoint of page: script name (uniform page) or list of the
# page's per-codepoint script names (mixed page)}.
script_tables = {}


def _store_page(table, page_start, page_entries, page_script):
    """Record one finished page in *table* under its first codepoint.

    page_script is the script shared by every codepoint of the page, or None
    when the page mixes scripts. Uniform pages are stored as the script name
    and the script is registered in special_scripts on first sight; mixed
    pages are stored as the full list of entries.
    """
    if page_script:
        if page_script not in special_scripts:
            special_scripts.append(page_script)
        table[page_start] = page_script
    else:
        table[page_start] = page_entries


for codepoints, script, comment in script_sets:
    if script:
        # Ranges with a fixed script need no tables.
        continue
    table = {}
    table_entry = None
    table_codepoint = None
    table_script = None
    for i, codepoint in enumerate(codepoints):
        # Codepoints missing from the data are reported as Zzzz (Unknown).
        script = unicode_chars.get(codepoint, 'Zzzz')
        if (i % 256) == 0:
            # A new page starts here: flush the previous one, if any.
            if table_entry:
                _store_page(table, table_codepoint, table_entry, table_script)
            table_entry = []
            table_codepoint = codepoint
            table_script = script
        if script != table_script:
            # The page is mixed; None never compares equal to a script name,
            # so it stays None for the rest of the page.
            table_script = None
        table_entry.append(script)
    # Flush the last page with the same rules as every other page, so that a
    # uniform final page is shared too rather than emitted as its own table.
    if table_entry:
        _store_page(table, table_codepoint, table_entry, table_script)
    script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
# Head of the generated C file: licence notice, includes and one short alias
# per UCD_SCRIPT_* enumerator so the tables can use bare script codes. The
# single %s is replaced with the Unicode Character Data version.
# <stdint.h> is included because the generated tables are declared uint8_t.
_HEADER_TEMPLATE = """/* Unicode Scripts
*
* Copyright (C) 2012-2016 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/
/* NOTE: This file is automatically generated from the Scripts.txt file in
* the Unicode Character database by the ucd-tools/tools/scripts.py script.
*/
#include "ucd/ucd.h"
#include <stdint.h>
#define Adlm UCD_SCRIPT_Adlm
#define Afak UCD_SCRIPT_Afak
#define Aghb UCD_SCRIPT_Aghb
#define Ahom UCD_SCRIPT_Ahom
#define Arab UCD_SCRIPT_Arab
#define Armi UCD_SCRIPT_Armi
#define Armn UCD_SCRIPT_Armn
#define Avst UCD_SCRIPT_Avst
#define Bali UCD_SCRIPT_Bali
#define Bamu UCD_SCRIPT_Bamu
#define Bass UCD_SCRIPT_Bass
#define Batk UCD_SCRIPT_Batk
#define Beng UCD_SCRIPT_Beng
#define Bhks UCD_SCRIPT_Bhks
#define Blis UCD_SCRIPT_Blis
#define Bopo UCD_SCRIPT_Bopo
#define Brah UCD_SCRIPT_Brah
#define Brai UCD_SCRIPT_Brai
#define Bugi UCD_SCRIPT_Bugi
#define Buhd UCD_SCRIPT_Buhd
#define Cakm UCD_SCRIPT_Cakm
#define Cans UCD_SCRIPT_Cans
#define Cari UCD_SCRIPT_Cari
#define Cham UCD_SCRIPT_Cham
#define Cher UCD_SCRIPT_Cher
#define Cirt UCD_SCRIPT_Cirt
#define Copt UCD_SCRIPT_Copt
#define Cprt UCD_SCRIPT_Cprt
#define Cyrl UCD_SCRIPT_Cyrl
#define Cyrs UCD_SCRIPT_Cyrs
#define Deva UCD_SCRIPT_Deva
#define Dsrt UCD_SCRIPT_Dsrt
#define Dupl UCD_SCRIPT_Dupl
#define Egyd UCD_SCRIPT_Egyd
#define Egyh UCD_SCRIPT_Egyh
#define Egyp UCD_SCRIPT_Egyp
#define Elba UCD_SCRIPT_Elba
#define Ethi UCD_SCRIPT_Ethi
#define Geok UCD_SCRIPT_Geok
#define Geor UCD_SCRIPT_Geor
#define Glag UCD_SCRIPT_Glag
#define Gonm UCD_SCRIPT_Gonm
#define Goth UCD_SCRIPT_Goth
#define Gran UCD_SCRIPT_Gran
#define Grek UCD_SCRIPT_Grek
#define Gujr UCD_SCRIPT_Gujr
#define Guru UCD_SCRIPT_Guru
#define Hang UCD_SCRIPT_Hang
#define Hani UCD_SCRIPT_Hani
#define Hano UCD_SCRIPT_Hano
#define Hans UCD_SCRIPT_Hans
#define Hant UCD_SCRIPT_Hant
#define Hatr UCD_SCRIPT_Hatr
#define Hebr UCD_SCRIPT_Hebr
#define Hira UCD_SCRIPT_Hira
#define Hluw UCD_SCRIPT_Hluw
#define Hmng UCD_SCRIPT_Hmng
#define Hrkt UCD_SCRIPT_Hrkt
#define Hung UCD_SCRIPT_Hung
#define Inds UCD_SCRIPT_Inds
#define Ital UCD_SCRIPT_Ital
#define Java UCD_SCRIPT_Java
#define Jpan UCD_SCRIPT_Jpan
#define Jurc UCD_SCRIPT_Jurc
#define Kali UCD_SCRIPT_Kali
#define Kana UCD_SCRIPT_Kana
#define Khar UCD_SCRIPT_Khar
#define Khmr UCD_SCRIPT_Khmr
#define Khoj UCD_SCRIPT_Khoj
#define Knda UCD_SCRIPT_Knda
#define Kore UCD_SCRIPT_Kore
#define Kpel UCD_SCRIPT_Kpel
#define Kthi UCD_SCRIPT_Kthi
#define Lana UCD_SCRIPT_Lana
#define Laoo UCD_SCRIPT_Laoo
#define Latf UCD_SCRIPT_Latf
#define Latg UCD_SCRIPT_Latg
#define Latn UCD_SCRIPT_Latn
#define Lepc UCD_SCRIPT_Lepc
#define Limb UCD_SCRIPT_Limb
#define Lina UCD_SCRIPT_Lina
#define Linb UCD_SCRIPT_Linb
#define Lisu UCD_SCRIPT_Lisu
#define Loma UCD_SCRIPT_Loma
#define Lyci UCD_SCRIPT_Lyci
#define Lydi UCD_SCRIPT_Lydi
#define Mahj UCD_SCRIPT_Mahj
#define Mand UCD_SCRIPT_Mand
#define Mani UCD_SCRIPT_Mani
#define Marc UCD_SCRIPT_Marc
#define Maya UCD_SCRIPT_Maya
#define Mend UCD_SCRIPT_Mend
#define Merc UCD_SCRIPT_Merc
#define Mero UCD_SCRIPT_Mero
#define Mlym UCD_SCRIPT_Mlym
#define Modi UCD_SCRIPT_Modi
#define Mong UCD_SCRIPT_Mong
#define Moon UCD_SCRIPT_Moon
#define Mroo UCD_SCRIPT_Mroo
#define Mtei UCD_SCRIPT_Mtei
#define Mult UCD_SCRIPT_Mult
#define Mymr UCD_SCRIPT_Mymr
#define Narb UCD_SCRIPT_Narb
#define Nbat UCD_SCRIPT_Nbat
#define Newa UCD_SCRIPT_Newa
#define Nkgb UCD_SCRIPT_Nkgb
#define Nkoo UCD_SCRIPT_Nkoo
#define Nshu UCD_SCRIPT_Nshu
#define Ogam UCD_SCRIPT_Ogam
#define Olck UCD_SCRIPT_Olck
#define Orkh UCD_SCRIPT_Orkh
#define Orya UCD_SCRIPT_Orya
#define Osge UCD_SCRIPT_Osge
#define Osma UCD_SCRIPT_Osma
#define Palm UCD_SCRIPT_Palm
#define Pauc UCD_SCRIPT_Pauc
#define Perm UCD_SCRIPT_Perm
#define Phag UCD_SCRIPT_Phag
#define Phli UCD_SCRIPT_Phli
#define Phlp UCD_SCRIPT_Phlp
#define Phlv UCD_SCRIPT_Phlv
#define Phnx UCD_SCRIPT_Phnx
#define Plrd UCD_SCRIPT_Plrd
#define Prti UCD_SCRIPT_Prti
#define Qaak UCD_SCRIPT_Qaak
#define Rjng UCD_SCRIPT_Rjng
#define Roro UCD_SCRIPT_Roro
#define Runr UCD_SCRIPT_Runr
#define Samr UCD_SCRIPT_Samr
#define Sara UCD_SCRIPT_Sara
#define Sarb UCD_SCRIPT_Sarb
#define Saur UCD_SCRIPT_Saur
#define Sgnw UCD_SCRIPT_Sgnw
#define Shaw UCD_SCRIPT_Shaw
#define Shrd UCD_SCRIPT_Shrd
#define Sidd UCD_SCRIPT_Sidd
#define Sind UCD_SCRIPT_Sind
#define Sinh UCD_SCRIPT_Sinh
#define Sora UCD_SCRIPT_Sora
#define Soyo UCD_SCRIPT_Soyo
#define Sund UCD_SCRIPT_Sund
#define Sylo UCD_SCRIPT_Sylo
#define Syrc UCD_SCRIPT_Syrc
#define Syre UCD_SCRIPT_Syre
#define Syrj UCD_SCRIPT_Syrj
#define Syrn UCD_SCRIPT_Syrn
#define Tagb UCD_SCRIPT_Tagb
#define Takr UCD_SCRIPT_Takr
#define Tale UCD_SCRIPT_Tale
#define Talu UCD_SCRIPT_Talu
#define Taml UCD_SCRIPT_Taml
#define Tang UCD_SCRIPT_Tang
#define Tavt UCD_SCRIPT_Tavt
#define Telu UCD_SCRIPT_Telu
#define Teng UCD_SCRIPT_Teng
#define Tfng UCD_SCRIPT_Tfng
#define Tglg UCD_SCRIPT_Tglg
#define Thaa UCD_SCRIPT_Thaa
#define Thai UCD_SCRIPT_Thai
#define Tibt UCD_SCRIPT_Tibt
#define Tirh UCD_SCRIPT_Tirh
#define Ugar UCD_SCRIPT_Ugar
#define Vaii UCD_SCRIPT_Vaii
#define Visp UCD_SCRIPT_Visp
#define Wara UCD_SCRIPT_Wara
#define Wole UCD_SCRIPT_Wole
#define Xpeo UCD_SCRIPT_Xpeo
#define Xsux UCD_SCRIPT_Xsux
#define Yiii UCD_SCRIPT_Yiii
#define Zanb UCD_SCRIPT_Zanb
#define Zinh UCD_SCRIPT_Zinh
#define Zmth UCD_SCRIPT_Zmth
#define Zsym UCD_SCRIPT_Zsym
#define Zxxx UCD_SCRIPT_Zxxx
#define Zyyy UCD_SCRIPT_Zyyy
#define Zzzz UCD_SCRIPT_Zzzz
/* Unicode Character Data %s */
"""


def _write_table(name, entries):
    """Write one C array ``scripts_<name>[256]`` to standard output.

    entries is the sequence of script codes to emit as initializers. They are
    laid out 16 per row, each row prefixed with a comment giving the
    hexadecimal offset of its first entry.
    """
    out = sys.stdout.write
    out('\n')
    out('static const uint8_t scripts_%s[256] =\n' % name)
    out('{')
    for i, entry in enumerate(entries):
        if (i % 16) == 0:
            out('\n\t/* %02X */' % i)
        out(' %s,' % entry)
    out('\n};\n')


def main():
    """Write the generated C source for script lookup to standard output.

    The output consists of, in order: the file header, one shared table per
    entry of special_scripts, one table per mixed page, one table of page
    pointers per table-driven range of script_sets, and the
    ucd_lookup_script function that walks the ranges in script_sets order.
    """
    out = sys.stdout.write
    out(_HEADER_TEMPLATE % ucd_version)

    # Shared tables: 256 copies of the same script code.
    for script in special_scripts:
        _write_table(script, [script] * 256)

    # Per-page tables for the pages that mix scripts. Pages stored as a
    # script name use one of the shared tables above and are skipped here.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
        for codepoint in sorted(tables):
            table = tables[codepoint]
            if table in special_scripts:
                continue
            _write_table(codepoint, table)

    # One array of page pointers per table-driven range, in page order.
    for codepoints, script, comment in script_sets:
        if script:
            continue
        table_index = '%s_%s' % (codepoints.first, codepoints.last)
        out('\n')
        out('static const uint8_t *scripts_%s[] =\n' % table_index)
        out('{\n')
        for codepoint, table in sorted(script_tables[table_index].items()):
            if isinstance(table, str):
                # Uniform page: point at the shared table and keep the
                # page's first codepoint as a comment.
                out('\tscripts_%s, /* %s */\n' % (table, codepoint))
            else:
                out('\tscripts_%s,\n' % codepoint)
        out('};\n')

    # The lookup function tests the ranges in ascending order, so each test
    # only needs to compare against the upper bound of its range.
    out('\n')
    out('ucd_script ucd_lookup_script(codepoint_t c)\n')
    out('{\n')
    for codepoints, script, comment in script_sets:
        if script:
            out('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, script, codepoints, comment))
        else:
            out('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
            out('\t{\n')
            out('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
            # Plain string, not a format: the % below is the C modulo operator.
            out('\t\treturn (ucd_script)table[c % 256];\n')
            out('\t}\n')
    out('\treturn Zzzz; /* Invalid Unicode Codepoint */\n')
    out('}\n')


if __name__ == '__main__':
    main()