# Setup Environment

In [1]:
!pip install hazm    # Requires restart.

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [1]:
!pip install groq
!pip install jiwer

Collecting groq
  Downloading groq-0.14.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.14.0-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.14.0
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.11.0


In [2]:
import pandas as pd
from groq import Groq
import re
from difflib import SequenceMatcher
from jiwer import cer
from tqdm import tqdm

# Setup LLM

In [46]:
def get_response(messages):
  client = Groq(
    api_key= '',    # Insert API key
  )

  while True:
    try:
      response = client.chat.completions.create(
          model='llama3-70b-8192',
          messages=messages,
      )

      response = response.choices[0].message.content
      return response

    except Exception as e:
      print(e)
      continue


# Get Dictionary

In [4]:
!wget https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv

--2025-01-09 20:32:04--  https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv
Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7945406 (7.6M) [text/plain]
Saving to: ‘KaamelDict.csv’


2025-01-09 20:32:05 (11.5 MB/s) - ‘KaamelDict.csv’ saved [7945406/7945406]



In [5]:
dict_path = "KaamelDict.csv"

In [6]:
dict_df = pd.read_csv(dict_path)

In [7]:
kaamel_dict = {}

for idx, row in dict_df.iterrows():
  g, p = row['grapheme'], ''.join(eval(row['phoneme']))
  if g not in kaamel_dict:
    kaamel_dict[g] = []
  kaamel_dict[g].append(p)

In [8]:
phoneme_to_finglish_map = {
  'A': 'aa',
  'S': 'Sh',
  'Z': 'Zh',
  'q': 'Gh',
  'x': 'Kh',
  'u': 'oo',
  '?': "'",
  'C': 'Ch'
}

def replace_phonetic_characters(input_string, char_map):
    # Create a translation table using str.maketrans
    translation_table = str.maketrans(char_map)

    # Use str.translate to replace characters based on the translation table
    return input_string.translate(translation_table)

In [9]:
finglish_kaamel_dict = {}

for k, vs in kaamel_dict.items():
  finglish_vs = []
  for v in vs:
    p = replace_phonetic_characters(v, phoneme_to_finglish_map)
    p = re.sub("([^\w\-\?]|^)'", r'\1', p)
    finglish_vs.append(p)

  finglish_kaamel_dict[k] = finglish_vs

In [10]:
inverted_finglish_kaamel_dict = {}

for key, value_list in finglish_kaamel_dict.items():
    for value in value_list:
        inverted_finglish_kaamel_dict[value] = key

In [11]:
inverted_kaamel_dict = {}

for key, value_list in kaamel_dict.items():
    for value in value_list:
        inverted_kaamel_dict[value] = key

In [12]:
def word_in_dict(word, inverted_dictionary=inverted_finglish_kaamel_dict):
  return word in inverted_dictionary

# Define post-processing

In [13]:
output_to_finglish_map = {
    'м': 'm',
    'ʷ': 'v',
    'w': 'v',
    'q': 'Gh',
    'x': 'Kh',
    'u': 'oo',
    '?': "'",
    'ĉ': 'Ch',
    'č': 'Ch',
    '̕': "'",
    'ʔ': "'",
    'ꞌ': "'",
    '̛':  "'",
    '’': "'",
    'ʼ': "'",
    'ʿ': "'",
    '̓': '',
    'â': 'aa',
    'â': 'aa',
    'ȃ': 'aa',
    'c': 'k',
    'ž': 'Zh',
    'š': 'Sh',
    'W': 'v',
    'β': 'f',
    'е': 'e',
    'х': 'Kh',
    '`': "'",
    'ɑ': 'aa',
    'ɑ': 'aa',
    'ʃ': 'Sh',
    'ð': 'z',
    'ɾ': 'r',
    'æ': 'a',
    'ɪ': 'e',
    'χ': 'Kh',
    'ɣ': 'Gh',
    'ʒ': 'Zh',
    ':': '',
    'ā': 'aa',
    'ː': '',
    'ä': 'aa',
    'á': 'aa',
    'š': 'Sh',
    'ū': 'oo',
    'ś': 's',
    'ī': 'i',
    'î': 'i',
    'é': 'e',
    'ḥ': 'h',
    'ɒ': 'aa',
    'ʰ': 'h',
    'ə': 'e',
    'R': 'r',
    'W': 'v',
    'Q': 'q',
    'T': 't',
    'Y': 'y',
    'P': 'p',
    'D': 'd',
    'F': 'f',
    'H': 'h',
    'J': 'j',
    'L': 'l',
    'X': 'Kh',
    'V': 'v',
    'B': 'b',
    'N': 'n',
    'M': 'm',
    'K': 'k',
    'G': 'g',
    'U': 'u',
    'O': 'o',
    'I': 'i',
    'E': 'e',
    'ا': 'aa',
    'ب': 'b',
    'پ': 'p',
    'ت': 't',
    'ث': 's',
    'ج': 'j',
    'چ': 'Ch',
    'ح': 'h',
    'خ': 'Kh',
    'د': 'd',
    'ذ': 'z',
    'ر': 'r',
    'ز': 'z',
    'ژ': 'Zh',
    'س': 's',
    'ش': 'Sh',
    'ص': 's',
    'ض': 'z',
    'ط': 't',
    'ظ': 'z',
    'ع': "'",
    'غ': 'Gh',
    'ف': 'f',
    'ق': 'Gh',
    'ک': 'k',
    'گ': 'g',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'و': 'v',
    'ه': 'h',
    'ی': 'y',
    'ء': "'",
    'ئ': "'",
    'ؤ': "o'",
    'آ': 'aa',
    'أ': "a'",
    'إ': "e'",
    'ۀ': 'eye',
    'ŋ': 'ng',
    '.': '',
    'ɛ': 'e',
    'ʊ': 'oo',
    "ˈ": "'",
    'ù': 'oo',
    'θ': 's',
    '̪': '',
    'ũ': 'oo',
    '_': ''
}


def replace_LLM_characters(input_string, char_map):
    substituted = re.sub(r'tʃʰ', 'ch', input_string)
    substituted = re.sub('tʃ', 'ch', substituted)
    substituted = re.sub(r't͡S', 'ch', substituted)
    substituted = re.sub(r'kʰ', 'k', substituted)
    substituted = re.sub(r'pʰ', 'p', substituted)
    substituted = re.sub(r'tʰ', 't', substituted)
    substituted = re.sub(r'ow', 'o', substituted)
    substituted = re.sub('dʒ', 'j', substituted)

    # Create a translation table using str.maketrans
    translation_table = str.maketrans(char_map)

    # Use str.translate to replace characters based on the translation table
    translated = substituted.translate(translation_table)

    return translated

In [14]:
def get_finglish_consonants(word):
  char_map = {
      'ا': '', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',
      'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',
      'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "'",
      'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
      'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': "'",'ئ': "'", 'ؤ': "'",
      'آ': '', 'أ': "'", 'إ': "'", 'ۀ': 'y'
  }
  mapped_string = ''.join(char_map.get(char, char) for char in word)
  return mapped_string

In [15]:
def get_updated_span(match_span, displacements):
  new_start, new_end = match_span[0], match_span[1]
  for start, displacement in displacements:
    if start <= new_start:
      new_start += displacement
      new_end += displacement

  return (new_start, new_end)

In [16]:
output_to_phonetics_map = {
    'м': 'm',
    'ʷ':' v',
    'w': 'v',
    'c': 'k',
    'ĉ': 'C',
    'č': 'C',
    '̕': "?",
    "'": '?',
    'ʔ': "?",
    'ꞌ': "?",
    '̛':  "?",
    '’': "?",
    'ʼ': "?",
    "'": '?',
    'â': 'A',
    'â': 'A',
    'ȃ': 'A',
    'ž': 'Z',
    'š': 'S',
    'W': 'v',
    'β': 'f',
    'е': 'e',
    '`': "?",
    'ɑ': 'A',
    'ɑ': 'A',
    'ʃ': 'S',
    'ð': 'z',
    'ɾ': 'r',
    'æ': 'a',
    'ɪ': 'e',
    'χ': 'x',
    'ɣ': 'q',
    'ʒ': 'Z',
    ':': '',
    'ː': '',
    'ā': 'A',
    'ː': '',
    'ä': 'A',
    'á': 'A',
    'š': 'S',
    'ū': 'u',
    'û': 'u',
    'ś': 's',
    'ī': 'i',
    'í': 'i',
    'î': 'i',
    'é': 'e',
    'ḥ': 'h',
    'ɒ': 'A',
    'ʰ': '',
    'ə': 'e',
    'R': 'r',
    'W': 'v',
    'Q': 'q',
    'T': 't',
    'Y': 'y',
    'P': 'p',
    'D': 'd',
    'F': 'f',
    'H': 'h',
    'J': 'j',
    'L': 'l',
    'X': 'x',
    'V': 'v',
    'B': 'b',
    'N': 'n',
    'M': 'm',
    'K': 'k',
    'G': 'g',
    'U': 'u',
    'O': 'o',
    'I': 'i',
    'E': 'e',
    'ا': 'A',
    'ب': 'b',
    'پ': 'p',
    'ت': 't',
    'ث': 's',
    'ج': 'j',
    'چ': 'C',
    'ح': 'h',
    'خ': 'x',
    'د': 'd',
    'ذ': 'z',
    'ر': 'r',
    'ز': 'z',
    'ژ': 'Z',
    'س': 's',
    'ش': 'S',
    'ص': 's',
    'ض': 'z',
    'ط': 't',
    'ظ': 'z',
    'ع': "?",
    'غ': 'q',
    'ف': 'f',
    'ق': 'q',
    'ک': 'k',
    'گ': 'g',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'و': 'v',
    'ه': 'h',
    'ی': 'y',
    'ء': "?",
    'ئ': "?",
    'ؤ': "o?",
    'آ': 'A',
    'أ': "a?",
    'إ': "e?",
    'ۀ': 'eye',
    'ŋ': 'ng',
    '.': '',
    'ɛ': 'e',
    'ʊ': 'u',
    "ˈ": '?',
    'ù': 'u',
    'θ': 's',
    '̪': '',
    'ũ': 'u',
    '_': '',
    'ç': 'C',
    'ĝ': 'q',
    'ɢ': 'q',
    'ː': '',
    'í': 'i',
    'ŝ': 'S',
    '!': '',
    'ǧ': 'q',
    'ʻ': '?',
    'è': 'e',
    '�': '',
    'ú': 'u',
    'ô': 'o',
    'ē': 'e',
    'à': 'A',
    'ă': 'A',
    'ǐ': 'i',
    'ü': 'u',
    '\u200e': '',
    'ğ': 'q',
    'ṣ': 'S',
    'â': 'A',
    'â': 'A',
    'ȃ': 'A',
    'ž': 'Z',
    'š': 'S',
    'ā': 'A',
    'ː': '',
    'ä': 'A',
    'á': 'A',
    'š': 'S',
    'ū': 'u',
    'û': 'u',
    'ś': 'S',
    'ī': 'i',
    'í': 'i',
    'î': 'i',
    'é': 'e',
}

consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'
vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'


def replace_LLM_phonetic_characters(input_string, char_map, from_phonetics=False):
    if not from_phonetics:
      try:
        input_string = re.sub(r'Sh', 'S', input_string)
        input_string = re.sub(r'Ch', 'C', input_string)
        input_string = re.sub(r'Zh', 'Z', input_string)
        input_string = re.sub(r'Gh', 'q', input_string)
        input_string = re.sub(r'Kh', 'x', input_string)
      except:
        print(input_string)

    substituted = re.sub(r'ch', 'C', input_string)

    substituted = re.sub(r'tʃʰ', 'C', substituted)
    substituted = re.sub(r'tʃ', 'C', substituted)
    substituted = re.sub(r't͡S', 'C', substituted)
    substituted = re.sub(r'ow', 'o', substituted)
    substituted = re.sub('dʒ', 'j', substituted)

    # Create a translation table using str.maketrans
    translation_table = str.maketrans(char_map)

    # Use str.translate to replace characters based on the translation table
    translated = substituted.translate(translation_table)

    substituted = re.sub('ee', 'i', translated)
    substituted = re.sub('ii', 'i', substituted)
    substituted = re.sub('oo', 'u', substituted)
    substituted = re.sub('uu', 'u', substituted)
    substituted = re.sub('aa', 'A', substituted)
    substituted = re.sub('AA', 'A', substituted)
    substituted = re.sub('Aa', 'A', substituted)
    substituted = re.sub('aA', 'A', substituted)
    substituted = re.sub(rf'(?:\b(sh)|(sh){consonants_regex}|(sh)\b)', 'S', substituted)
    substituted = re.sub(rf'(?:\b(kh)|(kh){consonants_regex}|(kh)\b)', 'x', substituted)
    substituted = re.sub(rf'(?:\b(zh)|(zh){consonants_regex}|(zh)\b)', 'Z', substituted)
    substituted = re.sub(rf'(?:\b(gh)|(gh){consonants_regex}|(gh)\b)', 'q', substituted)

    substituted = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', substituted)
    substituted = substituted.replace('?output=[', '')
    substituted = substituted.replace('[?output=', '')
    substituted = substituted.replace('output=[', '')
    substituted = substituted.replace('[output=', '')
    substituted = substituted.replace('output=', '')
    substituted = substituted.replace('output', '')
    substituted = substituted.replace('[', '')
    substituted = substituted.replace(']', '')
    substituted = substituted.replace('=', '')

    substituted = re.sub(r'[^a-zA-Z\?\s]', '', substituted)

    return substituted

In [17]:
def fix_ambiguities(model_text, gt_text):
  # fix sh
  i = 0
  for c in gt_text:
    if c in 'سصث':
      while i < len(model_text) - 1:
        i += 1
        if model_text[i] in 'sS': break

    if c == 'ش':
      while i < len(model_text) - 2:
        i += 1
        if model_text[i] == 'S' and model_text[i + 1] != 'h': break
        if model_text[i] in 'Ss' and model_text[i + 1] == 'h':
          model_text = model_text[:i] + 'S' + model_text[i + 2:]
          break

    if i >= len(model_text) - 1: break

  # fix zh
  i = 0
  for c in gt_text:
    if c in 'زذضظ':
      while i < len(model_text) - 1:
        i += 1
        if model_text[i] in 'zZ': break

    if c == 'ژ':
      while i < len(model_text) - 2:
        i += 1
        if model_text[i] == 'Z' and model_text[i + 1] != 'h': break
        if model_text[i] in 'zZ' and model_text[i + 1] == 'h':
          model_text = model_text[:i] + 'Z' + model_text[i + 2:]
          break

    if i >= len(model_text) - 1: break


  # fix kh
  i = 0
  for c in gt_text:
    if c == 'ک':
      while i < len(model_text) - 1:
        i += 1
        if model_text[i] in 'kK': break

    if c == 'خ':
      while i < len(model_text) - 2:
        i += 1
        if model_text[i] in 'xX': break
        if model_text[i] in 'kK' and model_text[i + 1] == 'h':
          model_text = model_text[:i] + 'x' + model_text[i + 2:]
          break

    if i >= len(model_text) - 1: break


  # fix gh
  i = 0
  for c in gt_text:
    if c == 'گ':
      while i < len(model_text) - 1:
        i += 1
        if model_text[i] in 'Gg': break

    if c in 'غق':
      while i < len(model_text) - 2:
        i += 1
        if model_text[i] == 'q': break
        if model_text[i] in 'Gg' and model_text[i + 1] == 'h':
          model_text = model_text[:i] + 'q' + model_text[i + 2:]
          break

    if i >= len(model_text) - 1: break

  return model_text

In [18]:
def substitute_by_dict(model_text, gt_text):
  subwords = []
  matched_spans = set()
  for match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", model_text):
      match_text = match.group()
      match_span = match.span()

      finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
      if finglish_text in inverted_finglish_kaamel_dict and inverted_finglish_kaamel_dict[finglish_text] in gt_text:
        max_sim, max_sim_p = -1, ''
        for p in kaamel_dict[inverted_finglish_kaamel_dict[finglish_text]]:
          phonetic_text = replace_LLM_phonetic_characters(finglish_text, output_to_phonetics_map)
          sim = SequenceMatcher(None, phonetic_text, p).ratio()
          if sim > max_sim:
            max_sim = sim
            max_sim_p = p

        gt_text = gt_text.replace(inverted_finglish_kaamel_dict[finglish_text], '')
        subwords.append((match_span, max_sim_p))
        matched_spans.add(match_span)

  for match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", model_text):
      match_text = match.group()
      match_span = match.span()

      if match_span in matched_spans: continue
      if not 'sh' in match_text and not 'kh' in match_text and not 'zh' in match_text and not 'Sh' in match_text and not 'Kh' in match_text and not 'Zh' in match_text: continue

      finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
      consonant_finglish = re.sub(r'(е|e|i|u|o|a|ā|ä|â|ā|ɒ|á|A)', '', finglish_text)

      for gt_match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", gt_text):
        gt_match_text = gt_match.group()
        gt_match_span = gt_match.span()

        gt_consonant_finglish = get_finglish_consonants(gt_match_text)
        if SequenceMatcher(None, consonant_finglish, gt_consonant_finglish).ratio() > 0.65:
            subwords.append((match_span, fix_ambiguities(match_text, gt_match_text)))

  displacements = []
  for span, replacement in subwords:
    updates_span = get_updated_span(span, displacements)
    model_text = model_text[:updates_span[0]] + replacement + model_text[updates_span[1]:]
    displacements.append((updates_span[0], len(replacement) - (updates_span[1] - updates_span[0])))

  return model_text

In [19]:
def get_known_words(graphemes, multiple_choices=True, dictionary=finglish_kaamel_dict):
  words = re.split('\W+', graphemes)
  if multiple_choices:
    return '\n'.join(f'{w}: {", ".join(dictionary[w])}' for w in words if w in dictionary)

  return '\n'.join(f'{w}: {", ".join(dictionary[w])}' for w in words if w in dictionary and len(dictionary[w]) <= 1)

In [20]:
def correct_output_by_llm_and_dict_info_finglish(grapheme, output, multi=True):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": "A model was used to convert Persian sentences into Finglish (Persian written in the Latin alphabet). We have a dictionary with Finglish of some of the words. You are an assistant that corrects the Finglish output of the model choosing the right information from that dictionary. Be careful not to remove the connective Ezafe phonemes '-e' and '-ye' and show ع, ئ, and ٔ with '."
            },
            {
                "role": "user",
                "content": f'''Here is the original Persian sentence: [{grapheme}].
                Here is the Fingish output of the model: [{output}].
                Here is the Finglish to some words I found from dictionary:
                {get_known_words(grapheme, multiple_choices=multi, dictionary=finglish_kaamel_dict)}.
                Please return the corrected Finglish of the Persian sentence in brackets like output=[].'''
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_characters(output, output_to_finglish_map)
      return output

In [21]:
def replace_words_with_dict(text, dictionary=finglish_kaamel_dict):
    pattern = r'\b\w+\b'

    modified_text = re.sub(pattern, lambda match: f'{dictionary[match.group()][0]}' if match.group() in dictionary and len(dictionary[match.group()]) == 1 else match.group(), text)

    return modified_text

In [22]:
def get_known_words_list(graphemes, multiple_choices=True, dictionary=finglish_kaamel_dict):
  words = re.split('\W+', graphemes)
  if multiple_choices:
    return [(w, dictionary[w]) for w in words if w in dictionary]

  return [(w, dictionary[w]) for w in words if w in dictionary and len(dictionary[w]) <= 1]

In [23]:
def substitute_output_by_dict(grapheme, output, dictionary=finglish_kaamel_dict):
  ACCEPTED_THRESHOLD = 0.65
  output = re.sub(r'([^еeiuoaāäâāɒáA])(-i)', r'\1i', output)

  alternatives = get_known_words_list(grapheme, dictionary=dictionary)
  output_words = re.split('[^-\w\?]+', output)
  pairs = []

  graphemes = []
  for grapheme, phonemes in alternatives:
    graphemes.append(grapheme)

    for j, phoneme in enumerate(phonemes):
      for i, word in enumerate(output_words):
        pairs.append((SequenceMatcher(None, phoneme, word).ratio(), phoneme, word, grapheme))

  sorted_pairs = sorted(pairs, key=lambda x: x[0], reverse=True)

  for score, phoneme, output_word, grapheme in sorted_pairs:
    if score < ACCEPTED_THRESHOLD: break
    if grapheme not in graphemes: continue
    graphemes.remove(grapheme)

    if output_word.endswith('-e'):
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

    elif output_word.endswith('-ye'):
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

    elif phoneme[-1] in 'еeiuoaāäâāɒáA' and output_word.endswith('ye') and SequenceMatcher(None, phoneme, word[:-2]).ratio() > score:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

    elif phoneme[-1] not in 'еeiuoaāäâāɒáA' and output_word.endswith('e') and SequenceMatcher(None, phoneme, word[:-1]).ratio() > score:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

    elif score > ACCEPTED_THRESHOLD:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme}\3", output)

  return output

# Prompt 1: Naive

In [24]:
def prompt1(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": "You are an assistant that converts Persian sentences into their IPA phonemes representation."
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{grapheme}].\n Return the phonemes of it in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      return output

# Prompt 2: In-Context Learning (ICL)

In [25]:
def prompt2(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their IPA phonemes representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "ɾæŋ-e ?ɒːbiː", "زندگی شیرین": "zendegiː-je ʃiːɾiːn").
                3. Use "ʃ" for 'ش', "tʃʰ" for 'چ', "x" for 'خ', "q" for 'ق'; "ɣ" for 'غ', "ʒ" for 'ژ'.

                Additional guidelines:
                - Short vowels: æ (ـَ), e (ـِ), o (ـُ)
                - Long vowels: ɒː (آ/ا), iː (ای), uː (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 'tʰ' for ت and ط; 'j' for ی; 'pʰ' for پ; 'kʰ' for ک; 'ɾ' for ر; 'ŋ' for نگ
                - Omit silent 'h' at the end of words (e.g., خانه → xɒːne, not xɒːneh)
                - Represent ع, ئ , and ء with an ʔ when it's pronounced

                Here are a few examples:
                input=[جریان شال چی بود؟], output=[dʒæɾjɒːn-e ʃɒːl tʃʰiː buːd]
                input=[گل نو در غار هست یا خانه؟], output=[ɡol-e now dæɾ ɣɒːɾ hæst jɒː xɒːne]
                input=[ژن زیبارویان پولدار], output=[ʒen-e ziːbɒːruːjɒːn-e puːldɒːɾ]
                input=[اتفاقی نمی‌افتد], output=[?ettefɒːqiː nemiː-?oftɒːd]
                input=[گرگ حیوانی وحشی است], output=[goɾg hejvɒːniː væhʃiː ?æst]
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{grapheme}].\n Return the phonemes of it in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      return output


# Prompt 3: Finglish

In [26]:
def prompt3(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                ## Examples
                input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].
                input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{grapheme}].\n Return the Finglish of it in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

# Prompt 4: Rule-based Correction

In [27]:
def prompt4(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                ## Examples
                input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].
                input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{grapheme}].\n Return the Finglish of it in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_characters(output, output_to_finglish_map)
      output = substitute_output_by_dict(grapheme, output)
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output


# Prompt 5: LLM-based Correction

In [28]:
def prompt5(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                ## Examples
                input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].
                input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{grapheme}].\n Return the Finglish of it in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_characters(output, output_to_finglish_map)
      output = correct_output_by_llm_and_dict_info_finglish(grapheme, output)
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

# Prompt 6: Dict Hints (1)

In [29]:
def prompt6(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                In the inputs you will be given, the Finglish of some of the words is given. Use the right pronunciations as help.

                ## Examples
                input=[جشن چهل مرد]. dict entries: 'مرد': mord, mard, 'جشن': jaShn, output=[jaShn-e Chehel mard].
                input=[گل نو در غار هست یا خانه؟]. dict entries: 'گل': gol, gel, 'در': dar, dorr, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد]. dict entries: . output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f'''This is the Persian sentence: [{grapheme}].
                These are the pronunciatin of some of the words I know:
                {get_known_words(grapheme, multiple_choices=True)}.
                Return Finglish of the Persian sentenc in brackets like output=[].'''
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

# Prompt 7: Dict Hints (2)

In [30]:
def prompt7(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                In the inputs you will be given, the Finglish of some of the words is given as help.

                ## Examples
                input=[جشن چهل مرد]. dict entries: 'چهل': Chehel, 'جشن': jaShn, output=[jaShn-e Chehel mard].
                input=[گل نو در غار هست یا خانه؟]. dict entries: 'غار': Ghaar, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد]. dict entries: . output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f'''This is the Persian sentence: [{grapheme}].
                These are the pronunciatin of some of the words I know:
                {get_known_words(grapheme, multiple_choices=False)}.
                Return Finglish of the Persian sentenc in brackets like output=[].'''
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

# Prompt 8: Dict Hints (3)

In [31]:
def prompt8(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.

                Here are a few examples of what I want:
                input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].
                input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]
                input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\n Complete the Finglish of it and return the result in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

# Prompt 9: Combined method

In [32]:
def prompt9(grapheme):
  matches = None

  while not matches:
    messages = [
            {
                "role": "system",
                "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.

                Here are a few examples of what I want:
                input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].
                input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]
                input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].
                '''
            },
            {
                "role": "user",
                "content": f"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\n Complete the Finglish of it and return the result in brackets like output=[]."
            }]

    response = get_response(messages)

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_characters(output, output_to_finglish_map)
      output = correct_output_by_llm_and_dict_info_finglish(grapheme, output)
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

## Get Evaluation Data

# Get Evaluation Data

In [33]:
!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv

--2025-01-09 20:32:28--  https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv
Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56029 (55K) [text/plain]
Saving to: ‘SentenceBench.csv’


2025-01-09 20:32:28 (4.37 MB/s) - ‘SentenceBench.csv’ saved [56029/56029]



In [34]:
sentence_bench = pd.read_csv('SentenceBench.csv', names=['dataset', 'grapheme', 'phoneme', 'homograph word',	'pronunciation'])

In [35]:
sentence_bench.head(3)

Unnamed: 0,dataset,grapheme,phoneme,homograph word,pronunciation
0,dataset,grapheme,phoneme,polyphone word,pronunciation
1,polyphone,من قدر تو را می‌دانم,man qadr-e to rA mi-dAnam,قدر,qadr
2,polyphone,از قضای الهی به قدر الهی پناه می‌برم,?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram,قدر,qadar


### Get ManaTTS Data

In [36]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]

# Convert to a list of tuples
mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))

mana_evaluation_data[:1]

[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\u200cبینا ',
  'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\u200cbinA ')]

### Get CommonVoice Data

In [37]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]

# Convert to a list of tuples
commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))

commonvoice_evaluation_data[:1]

[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',
  'dar ?aksar-e Sahr-hA, markazi barAye xarid-e  doCarxe vojud dArad.')]

### Get Homograph Data

In [38]:
filtered_rows = sentence_bench[sentence_bench['dataset'] == 'polyphone'][['grapheme', 'phoneme', 'homograph word',	'pronunciation']]

# Convert to a list of tuples
ambiguous_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))

ambiguous_evaluation_data[:1]

[('من قدر تو را می\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr')]

### Full Benchmark Data

In [39]:
benchmark = []

for g, p in mana_evaluation_data:
  benchmark.append((g, p, '', ''))

for g, p in commonvoice_evaluation_data:
  benchmark.append((g, p, '', ''))

for g, p, w, r in ambiguous_evaluation_data:
  benchmark.append((g, p, w, r))

# Inference

In [40]:
!mkdir llama3-70b-8192

In [None]:
with open('llama3-70b-8192/1.txt', 'w') as f:
  for g, p, w, r in tqdm(benchmark):
    output = prompt1(g)
    output = output.replace('\n', '')
    print(output)

    f.write(f"{output}\n")

# Define Evaluation Metrics

## PER Evaluation

In [43]:
def remove_non_word_chars(text):
    pattern = r'[^\w\s\?]'
    cleaned_text = re.sub(pattern, ' ', text)
    return cleaned_text

In [44]:
def remove_white_spaces(text):
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [45]:
def get_word_only_text(text):
  word_only_text = remove_non_word_chars(text)
  extra_space_removed_text = remove_white_spaces(word_only_text)

  return extra_space_removed_text

In [46]:
def get_texts_cer(reference, model_output):
  # Preprocess input texts to only contain word characters
  word_only_reference = get_word_only_text(reference)
  word_only_output = get_word_only_text(model_output)

  # Return +infinity for CER if any of the texts is empty
  if not word_only_reference.strip() or not word_only_output.strip():
    return float('inf')

  return cer(word_only_reference, word_only_output)

In [47]:
def get_avg_cer_of_method(method_outputs, references):
  cers = []
  for idx, o in enumerate(method_outputs):
    cer = get_texts_cer(o, references[idx][1])
    if cer != float('inf'):
      cers.append(cer)

  return sum(cers) / len(cers)

## Ezafe Evaluation

In [48]:
def get_EZ_words_from_ground_truth(text):
  pattern = r'\b(\w+)(-e|-ye)\b'
  matches = re.findall(pattern, text,)

  # Extract the words along with the suffix
  words_with_suffix = [match[0] + match[1] for match in matches]
  EZ_words = [tuple(re.split(r'(?=-)', w)) for w in words_with_suffix]

  return EZ_words

In [49]:
def get_EZ_words_from_phonetic_model_output(text):
    EZ_words = re.findall(r'\b(\w+)(-e|-ye)', text)
    EZ_word_candidates = []

    other_words = re.findall(r'\b(\w+)(?=(?:[^-\w]|$))', text)
    for word in other_words:
      if len(word) >= 4 and word[-3] in 'еeiuoaāäâāɒáA' and word.endswith('ye') and word_in_dict(word[:-2], inverted_kaamel_dict) and not word_in_dict(word, inverted_kaamel_dict) and not word_in_dict(word[:-1], inverted_kaamel_dict):
        EZ_words.append((word[:-2], '-ye'))
        continue

      if len(word) >= 3 and word.endswith('e') and word_in_dict(word[:-1], inverted_kaamel_dict) and not word_in_dict(word, inverted_kaamel_dict):
        EZ_words.append((word[:-1], '-e'))
        continue

      if len(word) >= 4 and word[-3] in 'еeiuoaāäâāɒáA' and word.endswith('ye'):
        EZ_word_candidates.append((word[:-2], '-ye'))
        continue

      if len(word) >= 3 and word.endswith('e'):
        EZ_word_candidates.append((word[:-1], '-e'))

    return EZ_words, EZ_word_candidates

In [50]:
def get_ezafe_TP_FP_TN_FN(gt_finglish, model_finglish):
  gt_word_count = len(re.findall(r'\b\w+(?:-\w+)*\b', gt_finglish))
  gt_EZ_words = get_EZ_words_from_ground_truth(gt_finglish)

  model_EZ_words, model_candidate_EZ_words = get_EZ_words_from_phonetic_model_output(model_finglish)

  TP = 0
  FP = 0
  TN = 0
  FN = 0

  gt_matched_indices = set()
  model_matched_indices = set()
  model_candidate_matched_indices = set()

  for gt_idx, (word, EZ) in enumerate(gt_EZ_words):
    for model_idx, (w, E) in enumerate(model_EZ_words):
      if model_idx not in model_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:
        TP += 1
        gt_matched_indices.add(gt_idx)
        model_matched_indices.add(model_idx)
        break
    else:
      for model_c_idx, (w, E) in enumerate(model_candidate_EZ_words):
        if model_c_idx not in model_candidate_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:
          TP += 1
          gt_matched_indices.add(gt_idx)
          model_candidate_matched_indices.add(model_c_idx)
          break

  # Calculate FP: model_EZ_words that are not TP
  FP = len(model_EZ_words) - (TP - len(list(model_candidate_matched_indices)))

  # Calculate FN: gt_EZ_words that were not detected
  FN = len(gt_EZ_words) - TP

  # Calculate TN: non-Ezafe words that are correctly not detected as Ezafe
  TN = (gt_word_count - len(gt_EZ_words)) - FP

  return TP, FP, TN, FN


In [51]:
def get_ezafe_performance(outputs, references):
  total_TP, total_FP, total_TN, total_FN = 0, 0, 0, 0

  for idx, o in enumerate(outputs):
    TP, FP, TN, FN = get_ezafe_TP_FP_TN_FN(references[idx][1], o)
    total_TP += TP
    total_FP += FP
    total_TN += TN
    total_FN += FN


  total_model_EZ = total_TP + total_FP
  total_gt_EZ = total_TP + total_FN

  total_model_T = total_TP + total_TN

  total_gt_words = total_TP + total_TN + total_FP + total_FN

  accuracy = (total_model_T) / (total_gt_words) * 100
  precision = (total_TP) / (total_model_EZ) * 100
  recall = (total_TP) / (total_gt_EZ) * 100

  return accuracy, precision, recall

## Homograph Evaluation

In [52]:
def get_homograph_performance(outputs, references):
  corrects = 0
  total = 0

  for idx, (g, p, homograph, right) in enumerate(references):
    if homograph != '':
      total += 1
      if right in outputs[idx]:
        corrects += 1

  return corrects / total

# Evaluate Outputs

In [53]:
base_path = 'llama3-70b-8192'

In [54]:
def get_method_outputs(method_name):
    predictions = []
    with open(base_path + f'/{method_name}.txt', 'r') as f:
        predictions = [line for line in f.read().splitlines() if line.strip()]
    return predictions


In [55]:
def print_all_metrics(predictions):
  per = get_avg_cer_of_method(predictions, benchmark) * 100
  acc, prec, recall = get_ezafe_performance(predictions, benchmark)
  homograph = get_homograph_performance(predictions, benchmark) * 100

  print(f"PER: \t\t\t{per:.2f}")
  print(f"ACC, PREC, RECALL, F1: \t{acc:.2f}, {prec:.2f}, {recall:.2f}, {((2 * prec * recall) / (prec + recall)):.2f}")
  print(f"Homograph: \t\t{homograph:.2f}")


In [None]:
# Evaluate prompt 1
print_all_metrics(get_method_outputs('1'))