#!/usr/bin/python3
# Translate the English espeak-ng emoji dictionary using CLDR "tts" annotations.
# Command-line arguments: the emoji dictionary to translate, the target
# language code, and the path to a CLDR checkout (see the bottom of the file).

import os
import re
import sys
import math
import codecs

import xml.etree.ElementTree as etree


class Emoji:
	# One dictionary entry: "<emoji><tabs><pronunciation>// [<codepoints>]<comment>".
	def __init__(self, m):
		self.emoji = m.group(1)
		self.pronunciation = m.group(2)
		self.codepoints = m.group(3)
		self.comment = m.group(4)

	def __repr__(self):
		return "Emoji(emoji={0}, pronunciation={1}, codepoints={2}, comment={3})".format(
			repr(self.emoji), repr(self.pronunciation), repr(self.codepoints), repr(self.comment))

	def __str__(self):
		return "{0}{1}// [{2}]{3}".format(
			self.emoji, self.pronunciation, self.codepoints, self.comment)


def read_annotations(filename):
	# Yield (codepoint sequence, name) pairs for the "tts" annotations in a CLDR LDML file.
	ldml = etree.parse(filename).getroot()
	for annotations in ldml.findall("annotations"):
		for annotation in annotations.findall("annotation"):
			if annotation.attrib.get("type", "") == "tts":
				yield annotation.attrib["cp"], annotation.text


def read_emoji(filename, encoding="utf-8"):
	# Yield an Emoji object for each dictionary entry and the plain string for
	# anything else (blank lines, comments, flag lines) so they pass through unchanged.
	re_emoji = re.compile(r"^([^ \t]*)([^/]*)// \[([^\]]*)\](.*)$")
	with codecs.open(filename, "r", encoding) as f:
		for line in f:
			line = line.replace("\n", "")
			if line.strip() == "":
				yield line  # blank line
			elif line.startswith("//"):
				yield line  # line comment
			elif line.startswith("$"):
				yield line  # flags only
			else:
				m = re_emoji.match(line)
				if m:
					yield Emoji(m)
				else:
					yield line


def find_langname(lang):
	# Look up the human-readable language name from the matching voice file
	# under espeak-ng-data in the current working directory.
	espeak_data_path = os.path.join(os.getcwd(), "espeak-ng-data")
	for root, dirnames, filenames in os.walk(espeak_data_path):
		if lang in filenames:
			filename = os.path.join(root, lang)
			with codecs.open(filename, "r", "utf-8") as f:
				for line in f:
					line = line.replace("\n", "")
					if line.startswith("name "):
						return line.replace("name ", "")


def normalize(text):
	# Strip punctuation that some CLDR translations wrap around the name.
	text = text.replace("\u201e", "")   # „ low double quotation mark
	text = text.replace("\u201c", "")   # “ left double quotation mark
	text = text.replace("\"", "")
	text = text.replace("\u2018", "'")  # ‘ left single quotation mark
	text = text.replace("\u00b7", " ")  # · middle dot
	text = text.replace("| ", "")  # alternatives, e.g. af "vonkstok | skitterstokkie"
	return text


emoji_dict = sys.argv[1]  # English emoji dictionary to translate
lang = sys.argv[2]        # target language code
cldr_path = sys.argv[3]   # path to a CLDR checkout

filenames = [
	os.path.join(cldr_path, "common", "annotations", "{0}.xml".format(lang)),
	os.path.join(cldr_path, "common", "annotationsDerived", "{0}.xml".format(lang)),
	os.path.join("data", "annotationsEspeak", "{0}.xml".format(lang))
]

# Later files override earlier ones, so local overrides win over CLDR.
annotations = {}
for filename in filenames:
	if os.path.exists(filename):
		for cp, name in read_annotations(filename):
			annotations[cp] = name

for entry in read_emoji(emoji_dict):
	if isinstance(entry, Emoji):
		# Drop the variation selector (U+FE0F) so lookups match the CLDR keys.
		translation = annotations.get(entry.emoji.replace("\uFE0F", ""), None)
		if translation:
			translation = normalize(translation)
			# Recompute the trailing tabs so the "// [...]" comments stay
			# column-aligned (tab stops every 8 characters).
			length = len(entry.pronunciation.strip())
			tabs = entry.pronunciation.count('\t') - 1
			first_tab = 8 - (length % 8)
			tab_length = length + first_tab + ((tabs - 1) * 8)
			new_length = len(translation)
			new_tabs = math.ceil((tab_length - new_length) / 8)
			entry.pronunciation = "\t{0}{1}".format(translation, "\t" * int(new_tabs))
		else:
			# Keep untranslated entries visible, but commented out.
			entry.comment += " (no translation)"
			entry = "//{0}".format(entry)
	elif entry == "// Emoji and Other Symbol pronunciations for English":
		langname = find_langname(lang)
		entry = "// Emoji and Other Symbol pronunciations for {0}".format(langname)
	elif entry == "// 2. common/annotations/en.xml (CLDR)":
		entry = "// 2. common/annotations/{0}.xml (CLDR)".format(lang)
	print(entry)