mahta.fetrat
/
HomoFast-eSpeak-Persian


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
							#!/usr/bin/python

# Copyright (C) 2012 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import iana

# Maps UCD script names (as spelled in the UCD data files) to script codes.
# It is seeded with names that cannot be obtained from the IANA registry and
# then extended with every script subtag found in that registry.
script_map = {
	# UCD script names not derivable from IANA script tags:
	'Canadian_Aboriginal': 'Cans',
	'Common': 'Zyyy',
	'Egyptian_Hieroglyphs': 'Egyp',
	'Inherited': 'Zyyy',
	'Meetei_Mayek': 'Mtei',
	'Nko': 'Nkoo',
	'Phags_Pa': 'Phag',
	# Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
	'Cuneiform': 'Xsux',
	'Duployan': 'Dupl',
}

for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
	if tag['Type'] != 'Script':
		continue
	# Convert the IANA script tag description to the UCD script name: drop any
	# trailing parenthesised qualifier and join the words with underscores.
	desc = tag['Description'].split(' (')[0].replace(' ', '_')
	script_map[desc] = ref

# Fix up incorrectly mapped script names:
script_map['Cyrillic'] = 'Cyrl'

class CodePoint:
	"""A single code point, stored as an integer in ``self.codepoint``.

	The constructor accepts either a hexadecimal string (as found in the
	data files) or a value that is stored as-is. Instances format as six
	upper-case hexadecimal digits, hash on their integer value, and iterate
	as a one-element sequence containing themselves.

	Comparisons work against any object exposing a ``codepoint`` attribute;
	for anything else they return ``NotImplemented`` so that, for example,
	``cp == None`` is simply False instead of raising AttributeError.
	"""

	def __init__(self, x):
		# Strings are parsed as hexadecimal; anything else is stored unchanged.
		if isinstance(x, str):
			self.codepoint = int(x, 16)
		else:
			self.codepoint = x

	def __repr__(self):
		return '%06X' % self.codepoint

	def __str__(self):
		return '%06X' % self.codepoint

	def __iter__(self):
		# Lets a single code point be looped over like a one-element range.
		yield self

	def __hash__(self):
		return self.codepoint

	def __eq__(self, other):
		try:
			other_value = other.codepoint
		except AttributeError:
			# Not comparable with us: let Python try the other operand.
			return NotImplemented
		return self.codepoint == other_value

	def __ne__(self, other):
		try:
			other_value = other.codepoint
		except AttributeError:
			return NotImplemented
		return self.codepoint != other_value

	def __lt__(self, other):
		try:
			other_value = other.codepoint
		except AttributeError:
			return NotImplemented
		return self.codepoint < other_value

class CodeRange:
	"""An inclusive run of code points written as ``FIRST..LAST``.

	``first`` and ``last`` are CodePoint objects built from the two halves of
	the input string. Iterating yields a new CodePoint for every value from
	``first`` to ``last`` inclusive.
	"""

	def __init__(self, x):
		bounds = x.split('..')
		# Exactly two parts are expected; anything else fails to unpack.
		start, end = bounds
		self.first = CodePoint(start)
		self.last  = CodePoint(end)

	def __repr__(self):
		endpoints = (self.first, self.last)
		return '%s..%s' % endpoints

	# The printable form is the same as the repr.
	__str__ = __repr__

	def __iter__(self):
		current = self.first.codepoint
		stop = self.last.codepoint
		while current <= stop:
			yield CodePoint(current)
			current += 1

	def size(self):
		"""Return how many code points the range covers, ends included."""
		span = self.last.codepoint - self.first.codepoint
		return span + 1

def codepoint(x):
	"""Consume the first field of ``x`` as a code point value.

	``x`` is the list of remaining fields of a data line. The first field is
	interpreted as:

	- a CodeRange when it contains ``..``;
	- a list of CodePoint objects when it holds several space-separated
	  values;
	- ``CodePoint('0000')`` when it is empty;
	- a single CodePoint otherwise.

	Returns a ``(value, remaining_fields)`` tuple.
	"""
	field, rest = x[0], x[1:]
	if '..' in field:
		return CodeRange(field), rest
	# Test the field itself, not the list of fields: a membership test on the
	# list never matches, so a multi-value field would reach int(..., 16) and fail.
	if ' ' in field:
		return [CodePoint(c) for c in field.split()], rest
	if field == '':
		return CodePoint('0000'), rest
	return CodePoint(field), rest

def string(x):
	"""Consume the first field of ``x`` as text.

	Returns ``(value, remaining_fields)``, where an empty field is reported
	as None.
	"""
	head, rest = x[0], x[1:]
	value = None if head == '' else head
	return value, rest

def integer(x):
	"""Consume the first field of ``x`` as a decimal integer.

	Returns ``(value, remaining_fields)``. ``int()`` raises ValueError when
	the field is not a valid integer (including when it is empty).
	"""
	value = int(x[0])
	rest = x[1:]
	return value, rest

def boolean(x):
	"""Consume the first field of ``x`` as a flag.

	Returns ``(value, remaining_fields)``; the value is True only when the
	field is exactly ``'Y'`` and False for anything else.
	"""
	flag = bool(x[0] == 'Y')
	return flag, x[1:]

def script(x):
	"""Consume the first field of ``x`` as a UCD script name.

	Returns ``(code, remaining_fields)``, where ``code`` is the entry stored
	for that name in the module-level ``script_map``. An unknown name raises
	KeyError.
	"""
	code = script_map[x[0]]
	rest = x[1:]
	return code, rest

def strlist(x):
	"""Consume every remaining field of ``x`` as one list value.

	Returns ``(x, [])``: the field list itself (not a copy) together with an
	empty list, since nothing is left to parse.
	"""
	remaining = []
	return x, remaining

# Field layout of each supported data file. Every entry is an ordered list of
# (key, parser) pairs; each parser takes the remaining fields of a line and
# returns (value, remaining_fields), so the pairs are applied left to right.
data_items = {}

data_items['Blocks'] = [
	('Range', codepoint),
	('Name', string),
]

data_items['DerivedAge'] = [
	('Range', codepoint),
	('Age', string),
]

data_items['PropList'] = [
	('Range', codepoint),
	('Property', string),
]

data_items['PropertyValueAliases'] = [
	('Property', string),
	('Key', string),
	('Value', string),
	# Whatever follows the value is collected as a single list of aliases.
	('Aliases', strlist),
]

data_items['Scripts'] = [
	('Range', codepoint),
	('Script', script),
]

data_items['UnicodeData'] = [
	('CodePoint', codepoint),
	('Name', string),
	('GeneralCategory', string),
	('CanonicalCombiningClass', integer),
	('BidiClass', string),
	('DecompositionType', string),
	('DecompositionMapping', string),
	('NumericType', string),
	('NumericValue', string),
	('BidiMirrored', boolean),
	('UnicodeName', string),
	('ISOComment', string),
	('UpperCase', codepoint),
	('LowerCase', codepoint),
	('TitleCase', codepoint),
]

# Supplemental Data:
data_items['Klingon'] = [
	('CodePoint', codepoint),
	('Script', string),
	('GeneralCategory', string),
	('Name', string),
	('Transliteration', string),
]

def parse_ucd_data(ucd_rootdir, dataset):
	"""Yield one dict per data line of ``<ucd_rootdir>/<dataset>.txt``.

	``dataset`` must be a key of ``data_items``; its (key, parser) list
	decides how the ';'-separated fields of each line are turned into the
	values of the yielded dict.

	Anything after a '#' is dropped, whitespace inside each field is
	collapsed to single spaces, and lines with fewer than two fields are
	skipped. A line whose second field ends in ', First>' is held back and
	merged into the following ', Last>' line, which is then emitted as a
	single 'FIRST..LAST' range with the '<' and ', Last>' markers removed
	from its name.
	"""
	schema = data_items[dataset]
	range_start = None
	path = os.path.join(ucd_rootdir, '%s.txt' % dataset)
	with open(path) as f:
		for raw in f:
			content = raw.replace('\n', '').split('#')[0]
			fields = [' '.join(part.split()) for part in content.split(';')]
			if len(fields) <= 1:
				continue

			label = fields[1]
			if label.endswith(', First>'):
				# Remember the start of the range; the matching Last line emits it.
				range_start = fields
				continue

			if label.endswith(', Last>'):
				fields[0] = '%s..%s' % (range_start[0], fields[0])
				fields[1] = label.replace(', Last>', '').replace('<', '')
				range_start = None

			record = {}
			for key, typemap in schema:
				# Each parser consumes what it needs and hands back the rest.
				record[key], fields = typemap(fields)
			yield record

if __name__ == '__main__':
	# Command line: <ucd_rootdir> <dataset> [<field>[,<field>...]]
	# Prints every parsed entry, or only the listed fields joined by commas.
	if len(sys.argv) < 3:
		sys.stderr.write('usage: %s <ucd_rootdir> <dataset> [<field>[,<field>...]]\n' % sys.argv[0])
		sys.exit(1)
	# The field list is optional; without it whole entries are printed.
	items = sys.argv[3].split(',') if len(sys.argv) > 3 else None
	for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
		if items:
			# Single-argument print(...) is valid on both Python 2 and Python 3.
			print(','.join([str(entry[item]) for item in items]))
		else:
			print(entry)