In [None]:
! pip install hazm==0.10.0

Collecting hazm==0.10.0
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)
  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy==1.24.3 (from hazm==0.10.0)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)
  Downloading pybind11-2.13.6-py3-none-a

In [None]:
# -y answers the confirmation prompt so the cell cannot block during "Run All"
!sudo apt-get install -y espeak-ng

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 34 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [None]:
# Pinned to the version this notebook was last run with, so metrics stay reproducible
!pip install jiwer==3.1.0

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [None]:
import pandas as pd
import re
from jiwer import cer

In [None]:
def transform_text(text):
    """
    Rewrite a phoneme string with a fixed set of textual rules.

    ``text_to_phonemes`` applies this to the raw espeak-ng output. The rules
    run in this order:
    1. Apply phoneme substitutions (sequentially, in the order listed below)
    2. Remove the symbols ``'``, ``:`` and ``,``
    3. Insert a question mark before a vowel that starts the string or
       directly follows a character that is neither a word character nor
       a hyphen

    Args:
        text (str): Input text to transform

    Returns:
        str: Transformed text
    """
    # Order matters: 'j' -> 'y' must run before 'dZ' -> 'j', otherwise the 'j'
    # produced from 'dZ' would itself be rewritten to 'y'.
    substitutions = (
        ('tS', 'C'),
        ('j', 'y'),
        ('dZ', 'j'),
        ('R', 'r'),
        ('q1', 'q'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Symbols are dropped before the vowel rule so that a vowel exposed by the
    # removal (e.g. after a leading comma) is still seen as string-initial.
    for symbol in ("'", ":", ","):
        text = text.replace(symbol, '')

    # The lookahead leaves the vowel in place; group 1 re-emits the boundary
    # character (or nothing at the start of the string) ahead of the '?'.
    return re.sub(r'([^\w\-]|^)(?=[aAeiuo])', r'\1?', text)


In [None]:
import subprocess

def text_to_phonemes(text, voice='fa', ipa=False, tie_character=None, separator=None):
    """
    Convert text to phonemes using espeak-ng.

    The stripped stdout of espeak-ng is passed through ``transform_text``
    before being returned.

    Args:
        text (str): Input text to convert to phonemes
        voice (str, optional): Voice to use (e.g., 'en-us', 'fr'). Defaults to 'fa'.
            A falsy value omits ``-v`` so espeak-ng picks its own default voice.
        ipa (bool, optional): Use International Phonetic Alphabet. Defaults to False.
        tie_character (str, optional): Character to join multi-letter phonemes. Defaults to None.
        separator (str, optional): Character to separate phonemes. Defaults to None.

    Returns:
        str: Phoneme representation of the input text

    Raises:
        RuntimeError: If espeak-ng exits with a non-zero status, or if the
            executable cannot be found.
    """
    command = ['espeak-ng', '-q', '-x']  # -q for quiet, -x for phoneme output

    if voice:
        command.extend(['-v', voice])
    if ipa:
        command.append('--ipa')
    # espeak-ng documents these options as --tie=<character> and
    # --sep=<character>; the attached form binds the value to the option
    # instead of leaving it as a separate command-line argument.
    if tie_character:
        command.append(f'--tie={tie_character}')
    if separator:
        command.append(f'--sep={separator}')

    # The text to process is the final positional argument
    command.append(text)

    try:
        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"espeak-ng failed: {e.stderr}") from e
    except FileNotFoundError as e:
        raise RuntimeError("espeak-ng is not installed or not in PATH") from e

    return transform_text(result.stdout.strip())

## Get Evaluation Data

In [None]:
!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv

--2025-05-10 20:59:44--  https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv
Resolving huggingface.co (huggingface.co)... 3.163.189.90, 3.163.189.37, 3.163.189.74, ...
Connecting to huggingface.co (huggingface.co)|3.163.189.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56026 (55K) [text/plain]
Saving to: ‘SentenceBench.csv’


2025-05-10 20:59:44 (6.50 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]



In [None]:
# Load the benchmark CSV saved into the working directory by the wget cell
sentence_bench = pd.read_csv('SentenceBench.csv')

In [None]:
sentence_bench.head(3)

Unnamed: 0,dataset,grapheme,phoneme,homograph word,pronunciation
0,homograph,من قدر تو را می‌دانم,man qadr-e to rA mi-dAnam,قدر,qadr
1,homograph,از قضای الهی به قدر الهی پناه می‌برم,?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram,قدر,qadar
2,homograph,به دست و صورتم کرم زدم,be dast-o suratam kerem zadam,کرم,kerem


### Get ManaTTS

In [None]:
# Keep only the grapheme/phoneme columns of the rows tagged 'mana-tts'
filtered_rows = sentence_bench.loc[
    sentence_bench['dataset'] == 'mana-tts',
    ['grapheme', 'phoneme'],
]

# Plain (grapheme, phoneme) tuples, one per row
mana_evaluation_data = [*filtered_rows.itertuples(index=False, name=None)]

mana_evaluation_data[:3]

[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\u200cبینا ',
  'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\u200cbinA '),
 ('به نام بی\u200cوپتیک یا عدسی دورنما آشنا شویم. ',
  'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),
 ('دراین\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',
  'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]

### Get CommonVoice

In [None]:
# Keep only the grapheme/phoneme columns of the rows tagged 'commonvoice'
filtered_rows = sentence_bench.loc[
    sentence_bench['dataset'] == 'commonvoice',
    ['grapheme', 'phoneme'],
]

# Plain (grapheme, phoneme) tuples, one per row
commonvoice_evaluation_data = [*filtered_rows.itertuples(index=False, name=None)]

commonvoice_evaluation_data[:3]

[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',
  'dar ?aksar-e Sahr-hA, markazi barAye xarid-e  doCarxe vojud dArad.'),
 ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',
  'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),
 ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]

### Get Homograph

In [None]:
# Homograph rows also carry the ambiguous word and its expected pronunciation
filtered_rows = sentence_bench.loc[
    sentence_bench['dataset'] == 'homograph',
    ['grapheme', 'phoneme', 'homograph word', 'pronunciation'],
]

# Plain (grapheme, phoneme, homograph word, pronunciation) tuples, one per row
homograph_evaluation_data = [*filtered_rows.itertuples(index=False, name=None)]

homograph_evaluation_data[:3]

[('من قدر تو را می\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),
 ('از قضای الهی به قدر الهی پناه می\u200cبرم',
  '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',
  'قدر',
  'qadar'),
 ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]

# Evaluate Method Outputs

## PER Evaluation

In [None]:
def remove_non_word_chars(text):
    """
    Strip every character that is not a word character, whitespace or '?'.

    Underscores are removed too: the regex word class counts them as word
    characters, so they need their own alternative in the pattern.

    Args:
        text (str): Text to clean

    Returns:
        str: ``text`` without punctuation/symbols other than '?'
    """
    # A single pass; previously the underscore removal restarted from the
    # raw input and threw away the result of the punctuation removal.
    return re.sub(r'[^\w\s\?]|_', '', text)

In [None]:
def remove_white_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    # split() with no separator drops leading/trailing whitespace and treats
    # any whitespace run as a single delimiter.
    return ' '.join(text.split())

In [None]:
def get_word_only_text(text):
  """Run `remove_non_word_chars` on the text, then normalize its whitespace."""
  return remove_white_spaces(remove_non_word_chars(text))

In [None]:
def get_texts_cer(reference, model_output):
  """
  Character error rate between a reference and a model output.

  Both texts are first cleaned with `get_word_only_text`. If either one is
  blank after cleaning, +infinity is returned instead of a CER value.
  """
  cleaned_reference = get_word_only_text(reference)
  cleaned_output = get_word_only_text(model_output)

  # CER is only computed when both sides still contain something
  if cleaned_reference.strip() and cleaned_output.strip():
    return cer(cleaned_reference, cleaned_output)

  return float('inf')

In [None]:
def get_avg_cer_of_method(method_outputs, references):
  """
  Average character error rate of a method's outputs over a benchmark.

  Hyphens are removed from both sides before scoring. Pairs for which
  `get_texts_cer` returns +infinity (an empty text) are left out of the mean.

  Args:
    method_outputs: Predicted phoneme strings, aligned by position with
      `references`.
    references: Sequence of tuples whose item at index 1 is the reference
      phoneme string.

  Returns:
    float: Mean CER over the scorable pairs, or +infinity when there is no
    scorable pair (including empty input).
  """
  scores = []
  for idx, output in enumerate(method_outputs):
    reference_phonemes = references[idx][1]
    # get_texts_cer expects (reference, model_output); CER is normalized by
    # the reference length, so the argument order affects the result.
    score = get_texts_cer(reference_phonemes.replace('-', ''), output.replace('-', ''))
    if score != float('inf'):
      scores.append(score)

  # Nothing to average: report it the same way a single unscorable pair is
  # reported instead of dividing by zero.
  if not scores:
    return float('inf')

  return sum(scores) / len(scores)

## Homograph Evaluation

In [None]:
def get_homograph_performance(outputs, references):
  """
  Fraction of homograph rows whose expected pronunciation occurs in the output.

  Args:
    outputs: Predicted phoneme strings, aligned by position with `references`.
    references: Sequence of 4-tuples
      (grapheme, phoneme, homograph word, pronunciation). Rows whose
      homograph word is the empty string are ignored.

  Returns:
    float: corrects / total over the homograph rows, or 0.0 when there is
    no homograph row at all.
  """
  corrects = 0
  total = 0

  for idx, (_, _, homograph, right) in enumerate(references):
    if homograph == '':
      continue
    total += 1
    # Substring match: the pronunciation only has to occur somewhere in the output
    if right in outputs[idx]:
      corrects += 1

  # Avoid ZeroDivisionError when the benchmark has no homograph rows
  if total == 0:
    return 0.0

  return corrects / total

# Full bench

In [None]:
# Every entry is (grapheme, phoneme, homograph word, pronunciation); the
# ManaTTS and CommonVoice rows have no homograph, so those slots are ''.
benchmark = [
    *((g, p, '', '') for g, p in mana_evaluation_data),
    *((g, p, '', '') for g, p in commonvoice_evaluation_data),
    *((g, p, w, r) for g, p, w, r in homograph_evaluation_data),
]

# Keep at most the first 400 entries
benchmark = benchmark[:400]

In [None]:
def print_all_metrics(predictions):
  """Print PER and homograph accuracy, both as percentages, for `predictions`
  scored against the module-level `benchmark`."""
  per = 100 * get_avg_cer_of_method(predictions, benchmark)
  homograph = 100 * get_homograph_performance(predictions, benchmark)

  print(f"PER: \t\t\t{per:.4f}", f"HOMOGRAPH: \t\t{homograph:.4f}", sep="\n")

# Inference

In [None]:
import time

In [None]:
# perf_counter() is a monotonic high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

# item[0] is the grapheme (input sentence) of each benchmark entry
mapped_outputs = [text_to_phonemes(item[0]) for item in benchmark]

total_time = time.perf_counter() - start_time
avg_time = total_time / len(benchmark) if benchmark else 0

# Results

In [None]:
# Accuracy metrics first, then the wall-clock cost of the inference run
print_all_metrics(mapped_outputs)
print(f"TOTAL TIME:\t\t{total_time:.2f} (s)", f"AVG TIME:\t\t{avg_time:.4f} (s)+", sep="\n")

PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		6.82 (s)
AVG TIME:		0.0170 (s)+


# Runs

## First

```
PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		6.58 (s)
AVG TIME:		0.0165 (s)+
```

## Second

```
PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		6.67 (s)
AVG TIME:		0.0167 (s)+
```

## Third

```
PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		6.65 (s)
AVG TIME:		0.0166 (s)+
```

## Fourth

```
PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		7.16 (s)
AVG TIME:		0.0179 (s)+
```

## Fifth

```
PER: 			6.9152
HOMOGRAPH: 		43.8679
TOTAL TIME:		6.82 (s)
AVG TIME:		0.0170 (s)+
```