Browse Source

Use PropertyValueAliases for the script mapping.

The mapping of the script labels in the UCD data to ISO 15924
script tags is now done using the sc property map in the
PropertyValueAliases data.

This has the following benefits:

    1.  It removes the dependency on the IANA subtag registry.

    2.  It ensures the scripts are correct as specified in the
        UCD data files.
master
Reece H. Dunn 10 years ago
parent
commit
1154409393
4 changed files with 479 additions and 489 deletions
  1. 0
    1
      .gitignore
  2. 5
    4
      Makefile.am
  3. 460
    460
      src/scripts.cpp
  4. 14
    24
      tools/ucd.py

+ 0
- 1
.gitignore View File

@@ -12,7 +12,6 @@

# build output:

data/language-subtag-registry
data/ucd

src/libucd.la

+ 5
- 4
Makefile.am View File

@@ -57,13 +57,14 @@ EXTRA_DIST += ChangeLog
# UCD_VERSION is filled in from the @UCD_VERSION@ substitution (presumably by
# configure -- confirm); UCD_ROOTDIR names the directory holding the UCD files.
# NOTE(review): the rules below spell out data/ucd literally instead of using
# UCD_ROOTDIR.
UCD_VERSION=@UCD_VERSION@
UCD_ROOTDIR=data/ucd

# Download the IANA language subtag registry to the target path ($@).
data/language-subtag-registry:
wget -O $@ http://www.iana.org/assignments/language-subtag-registry

# Download PropList.txt for the selected UCD version, creating data/ucd first.
data/ucd/PropList.txt:
mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropList.txt

# Download PropertyValueAliases.txt for the selected UCD version.
data/ucd/PropertyValueAliases.txt:
mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/PropertyValueAliases.txt

# Download Scripts.txt for the selected UCD version.
data/ucd/Scripts.txt:
mkdir -pv data/ucd
wget -O $@ http://www.unicode.org/Public/${UCD_VERSION}/ucd/Scripts.txt
@@ -79,7 +80,7 @@ html:

############################# libucd ##########################################

# Prerequisite-only rules (no recipe): they list the data files that
# tools/ucd.py depends on.
tools/ucd.py: data/language-subtag-registry
tools/ucd.py: data/ucd/PropertyValueAliases.txt

# tools/case.py depends on tools/ucd.py and on the UnicodeData.txt data file.
tools/case.py: tools/ucd.py \
data/ucd/UnicodeData.txt

+ 460
- 460
src/scripts.cpp
File diff suppressed because it is too large
View File


+ 14
- 24
tools/ucd.py View File

@@ -21,30 +21,7 @@ import os
import sys
import iana

# Lookup table from UCD script names to script tags, seeded with the entries
# that cannot be obtained from the IANA registry loop below.
script_map = {
# UCD script names not derivable from IANA script tags:
'Canadian_Aboriginal': 'Cans',
'Common': 'Zyyy',
'Egyptian_Hieroglyphs': 'Egyp',
'Inherited': 'Zyyy',
'Meetei_Mayek': 'Mtei',
'Nko': 'Nkoo',
'Phags_Pa': 'Phag',
# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
'Cuneiform': 'Xsux',
'Duployan': 'Dupl',
}

# Add one entry per 'Script' record found in the IANA subtag registry file.
for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
if tag['Type'] == 'Script':
# Convert the IANA script tag descriptions to the UCD script names:
desc = tag['Description']
if ' (' in desc:
# Keep only the text before the first ' (' in the description.
desc = desc.split(' (')[0]
desc = desc.replace(' ', '_')
script_map[desc] = ref
# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'
# NOTE(review): this rebinding discards the table built above. In this diff
# view the lines above look like the removed implementation and this line
# like its replacement -- confirm against the actual file.
script_map = {}

class CodePoint:
def __init__(self, x):
@@ -193,13 +170,26 @@ def parse_ucd_data(ucd_rootdir, dataset):
data[key], linedata = typemap(linedata)
yield data

def parse_property_mapping(ucd_rootdir, propname, reverse=False):
    """Build a lookup table for one property from PropertyValueAliases.

    Reads the 'PropertyValueAliases' dataset through parse_ucd_data and keeps
    only the records whose 'Property' field equals propname.

    ucd_rootdir -- root directory handed on to parse_ucd_data.
    propname    -- value to match against each record's 'Property' field.
    reverse     -- when false (the default) the result maps each record's
                   'Key' to its 'Value'; when true it maps 'Value' to 'Key'.

    Returns a dict.  If several matching records produce the same dict key,
    the last one read wins.
    """
    # Decide the lookup direction once, up front.
    if reverse:
        source_field, target_field = 'Value', 'Key'
    else:
        source_field, target_field = 'Key', 'Value'

    mapping = {}
    for record in parse_ucd_data(ucd_rootdir, 'PropertyValueAliases'):
        if record['Property'] != propname:
            continue
        mapping[record[source_field]] = record[target_field]
    return mapping

if __name__ == '__main__':
    # Command line: <ucd_rootdir> <dataset> [<item>,<item>,...]
    # The optional third argument selects which fields of each entry to print.
    try:
        items = sys.argv[3].split(',')
    except IndexError:
        # No field list was supplied; print every entry in full.  Only the
        # missing-argument case is handled so that other errors (and
        # Ctrl-C) are not silently swallowed.
        items = None
    # Script name -> script code table built from the 'sc' property aliases.
    script_map = parse_property_mapping(sys.argv[1], 'sc', reverse=True)
    for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        if items:
            print(','.join([str(entry[item]) for item in items]))
        else:
            print(entry)
else:
    # Imported as a module: load the table from the default data directory.
    script_map = parse_property_mapping('data/ucd', 'sc', reverse=True)

Loading…
Cancel
Save