Browse Source

Use PropertyValueAliases for the script mapping.

The mapping of the script labels in the UCD data to ISO 15924
script tags is now done using the sc property map in the
PropertyValueAliases data.

This has the following benefits:

    1.  It removes the dependency on the IANA subtag registry.

    2.  It ensures the scripts are correct as specified in the
        UCD data files.
master
Reece H. Dunn 10 years ago
parent
commit
1154409393
4 changed files with 479 additions and 489 deletions
  1. 0
    1
      .gitignore
  2. 5
    4
      Makefile.am
  3. 460
    460
      src/scripts.cpp
  4. 14
    24
      tools/ucd.py

+ 0
- 1
.gitignore View File



# build output: # build output:


data/language-subtag-registry
data/ucd data/ucd


src/libucd.la src/libucd.la

+ 5
- 4
Makefile.am View File

UCD_VERSION=@UCD_VERSION@ UCD_VERSION=@UCD_VERSION@
UCD_ROOTDIR=data/ucd UCD_ROOTDIR=data/ucd


data/language-subtag-registry:
wget -O $@ http://www.iana.org/assignments/language-subtag-registry

data/ucd/PropList.txt: data/ucd/PropList.txt:
mkdir -pv data/ucd mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt


data/ucd/PropertyValueAliases.txt:
mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropertyValueAliases.txt

data/ucd/Scripts.txt: data/ucd/Scripts.txt:
mkdir -pv data/ucd mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt


############################# libucd ########################################## ############################# libucd ##########################################


tools/ucd.py: data/language-subtag-registry
tools/ucd.py: data/ucd/PropertyValueAliases.txt


tools/case.py: tools/ucd.py \ tools/case.py: tools/ucd.py \
data/ucd/UnicodeData.txt data/ucd/UnicodeData.txt

+ 460
- 460
src/scripts.cpp
File diff suppressed because it is too large
View File


+ 14
- 24
tools/ucd.py View File

import sys import sys
import iana import iana


script_map = {
# UCD script names not derivable from IANA script tags:
'Canadian_Aboriginal': 'Cans',
'Common': 'Zyyy',
'Egyptian_Hieroglyphs': 'Egyp',
'Inherited': 'Zyyy',
'Meetei_Mayek': 'Mtei',
'Nko': 'Nkoo',
'Phags_Pa': 'Phag',
# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
'Cuneiform': 'Xsux',
'Duployan': 'Dupl',
}

for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
if tag['Type'] == 'Script':
# Convert the IANA script tag descriptions to the UCD script names:
desc = tag['Description']
if ' (' in desc:
desc = desc.split(' (')[0]
desc = desc.replace(' ', '_')
script_map[desc] = ref
# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'
script_map = {}


class CodePoint: class CodePoint:
def __init__(self, x): def __init__(self, x):
data[key], linedata = typemap(linedata) data[key], linedata = typemap(linedata)
yield data yield data


def parse_property_mapping(ucd_rootdir, propname, reverse=False):
    """Build a lookup table for one property from PropertyValueAliases.

    Every record yielded by ``parse_ucd_data(ucd_rootdir,
    'PropertyValueAliases')`` whose ``'Property'`` field equals *propname*
    contributes one entry to the result.

    ucd_rootdir -- directory passed through to ``parse_ucd_data``.
    propname    -- property name to select (e.g. ``'sc'``).
    reverse     -- when false, map each record's ``'Key'`` to its
                   ``'Value'``; when true, map ``'Value'`` to ``'Key'``.

    Returns a dict. If several records share the same lookup field, the
    last one read wins.
    """
    # Pick which record field is the lookup side and which is the result.
    if reverse:
        lookup_field, result_field = 'Value', 'Key'
    else:
        lookup_field, result_field = 'Key', 'Value'

    mapping = {}
    for record in parse_ucd_data(ucd_rootdir, 'PropertyValueAliases'):
        if record['Property'] != propname:
            continue
        mapping[record[lookup_field]] = record[result_field]
    return mapping

if __name__ == '__main__': if __name__ == '__main__':
try: try:
items = sys.argv[3].split(',') items = sys.argv[3].split(',')
except: except:
items = None items = None
script_map = parse_property_mapping(sys.argv[1], 'sc', reverse=True)
for entry in parse_ucd_data(sys.argv[1], sys.argv[2]): for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
if items: if items:
print ','.join([str(entry[item]) for item in items]) print ','.join([str(entry[item]) for item in items])
else: else:
print entry print entry
else:
script_map = parse_property_mapping('data/ucd', 'sc', reverse=True)

Loading…
Cancel
Save