{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "VtxEYym69RUH", "XjAPkfq7SF87" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "WEY5MiKLzurH" }, "source": [ "# Setup Environment" ] }, { "cell_type": "code", "source": [ "! pip install epitran==1.26.0" ], "metadata": { "id": "jviCS0zCmtJc", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e8d100ba-e606-4956-ee15-81ccc6557ba6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting epitran==1.26.0\n", " Downloading epitran-1.26.0-py2.py3-none-any.whl.metadata (34 kB)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (75.2.0)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (2024.11.6)\n", "Collecting panphon>=0.20 (from epitran==1.26.0)\n", " Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)\n", "Requirement already satisfied: marisa-trie in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (1.2.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (2.32.3)\n", "Collecting jamo (from epitran==1.26.0)\n", " Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)\n", "Collecting unicodecsv (from panphon>=0.20->epitran==1.26.0)\n", " Downloading unicodecsv-0.14.1.tar.gz (10 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (6.0.2)\n", "Requirement already satisfied: numpy>=1.20.2 in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (2.0.2)\n", "Requirement already satisfied: editdistance in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (0.8.1)\n", "Collecting munkres (from panphon>=0.20->epitran==1.26.0)\n", " Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (2.4.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (2025.4.26)\n", "Downloading epitran-1.26.0-py2.py3-none-any.whl (188 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.5/188.5 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading panphon-0.21.2-py2.py3-none-any.whl (75 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading jamo-0.4.1-py3-none-any.whl (9.5 kB)\n", "Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)\n", "Building wheels for collected packages: unicodecsv\n", " Building wheel for unicodecsv (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for unicodecsv: filename=unicodecsv-0.14.1-py3-none-any.whl size=10744 sha256=9d5442e17e65cdf34cadb6d4681337702fde69e9bea33a290ccb2bc88151e8b5\n", " Stored in directory: /root/.cache/pip/wheels/ec/03/6f/d2e0162d94c0d451556fa43dd4d5531457245c34a36b41ef4a\n", "Successfully built unicodecsv\n", "Installing collected packages: unicodecsv, munkres, jamo, panphon, epitran\n", "Successfully installed epitran-1.26.0 jamo-0.4.1 munkres-1.1.4 panphon-0.21.2 unicodecsv-0.14.1\n" ] } ] }, { "cell_type": "code", "source": [ "! pip install g2pk==0.9.4" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vxh7pA-mwSDV", "outputId": "f03e0881-3acb-4ab1-fbbb-016a0c4069f3" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting g2pk==0.9.4\n", " Downloading g2pK-0.9.4-py3-none-any.whl.metadata (7.5 kB)\n", "Requirement already satisfied: jamo in /usr/local/lib/python3.11/dist-packages (from g2pk==0.9.4) (0.4.1)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (from g2pk==0.9.4) (3.9.1)\n", "Collecting konlpy (from g2pk==0.9.4)\n", " Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)\n", "Collecting python-mecab-ko (from g2pk==0.9.4)\n", " Downloading python_mecab_ko-1.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n", "Collecting JPype1>=0.7.0 (from konlpy->g2pk==0.9.4)\n", " Downloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n", "Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.11/dist-packages (from konlpy->g2pk==0.9.4) (5.4.0)\n", "Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.11/dist-packages (from konlpy->g2pk==0.9.4) (2.0.2)\n", "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (8.1.8)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (1.4.2)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (2024.11.6)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (4.67.1)\n", "Collecting python-mecab-ko-dic (from python-mecab-ko->g2pk==0.9.4)\n", " Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl.metadata (1.4 kB)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from JPype1>=0.7.0->konlpy->g2pk==0.9.4) (24.2)\n", "Downloading g2pK-0.9.4-py3-none-any.whl (27 kB)\n", "Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.4/19.4 MB\u001b[0m \u001b[31m61.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading python_mecab_ko-1.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (580 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m580.9/580.9 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m494.1/494.1 kB\u001b[0m \u001b[31m33.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl (34.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.5/34.5 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: python-mecab-ko-dic, python-mecab-ko, JPype1, konlpy, g2pk\n", "Successfully installed JPype1-1.5.2 g2pk-0.9.4 konlpy-0.6.0 python-mecab-ko-1.3.7 python-mecab-ko-dic-2.1.1.post2\n" ] } ] }, { "cell_type": "code", "source": [ "! pip install jiwer" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "stR7NfnfZqB1", "outputId": "c5e09b12-0552-4e2d-fd8f-387c8308d1c4" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jiwer\n", " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n", "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n", "Collecting rapidfuzz>=3.9.7 (from jiwer)\n", " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n", "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n", "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PfthI4eOqBri" }, "outputs": [], "source": [ "import os\n", "import re\n", "import csv\n", "import pandas as pd\n", "import json\n", "import itertools\n", "from tqdm import tqdm\n", "from jiwer import cer" ] }, { "cell_type": "markdown", "source": [ "# mapping" ], "metadata": { "id": "VtxEYym69RUH" } }, { "cell_type": "code", "source": [ "output_to_phonetics_map = {\n", " 'м': 'm',\n", " 'ʷ':' v',\n", " 'w': 'v',\n", " 'c': 'k',\n", " 'ĉ': 'C',\n", " 'č': 'C',\n", " '̕': \"?\",\n", " \"'\": '?',\n", " 'ʔ': \"?\",\n", " 'ꞌ': \"?\",\n", " '̛': \"?\",\n", " '’': \"?\",\n", " 'ʼ': \"?\",\n", " \"'\": '?',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'W': 'v',\n", " 'β': 'f',\n", " 'е': 'e',\n", " '`': \"?\",\n", " 'ɑ': 'A',\n", " 'ɑ': 'A',\n", " 'ʃ': 'S',\n", " 'ð': 'z',\n", " 'ɾ': 'r',\n", " 'æ': 'a',\n", " 'ɪ': 'e',\n", " 'χ': 'x',\n", " 'ɣ': 'q',\n", " 'ʒ': 'Z',\n", " ':': '',\n", " 'ː': '',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 's',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", " 'ḥ': 'h',\n", " 'ɒ': 'A',\n", " 'ʰ': '',\n", " 'ə': 'e',\n", " 'R': 'r',\n", " 'W': 'v',\n", " 'Q': 'q',\n", " 'T': 't',\n", " 'Y': 'y',\n", " 'P': 'p',\n", " 'D': 'd',\n", " 'F': 'f',\n", " 'H': 'h',\n", " 'J': 'j',\n", " 'L': 'l',\n", " 'X': 'x',\n", " 'V': 'v',\n", " 'B': 'b',\n", " 'N': 'n',\n", " 'M': 'm',\n", " 'K': 'k',\n", " 'G': 'g',\n", " 'U': 'u',\n", " 'O': 'o',\n", " 'I': 'i',\n", " 'E': 'e',\n", " 'ŋ': 'ng',\n", " '.': '',\n", " 'ɛ': 'e',\n", " 'ʊ': 'u',\n", " \"ˈ\": '?',\n", " 'ù': 'u',\n", " 'θ': 's',\n", " '̪': '',\n", " 'ũ': 'u',\n", " '_': '',\n", " 'ç': 'C',\n", " 'ĝ': 'q',\n", " 'ɢ': 'q',\n", " 'ː': '',\n", " 'í': 'i',\n", " 'ŝ': 'S',\n", " '!': '',\n", " 'ǧ': 'q',\n", " 'ʻ': '?',\n", " 'è': 'e',\n", " '�': '',\n", " 'ú': 'u',\n", " 'ô': 'o',\n", " 'ē': 'e',\n", " 'à': 'A',\n", " 'ă': 'A',\n", " 'ǐ': 'i',\n", " 'ü': 'u',\n", " '\\u200e': '',\n", " 'ğ': 'q',\n", " 'ṣ': 'S',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 'S',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", "}\n", "\n", "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n", "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n", "\n", "\n", "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n", " substituted = re.sub(r'tʃʰ', 'C', input_string)\n", " substituted = re.sub(r't͡ʃ', 'C', input_string)\n", " substituted = re.sub(r'tʃ', 'C', substituted)\n", " substituted = re.sub(r't͡S', 'C', substituted)\n", " substituted = re.sub(r'ow', 'o', substituted)\n", " substituted = re.sub('d͡ʒ', 'j', substituted)\n", " substituted = re.sub('dʒ', 'j', substituted)\n", "\n", " # Create a translation table using str.maketrans\n", " translation_table = str.maketrans(char_map)\n", "\n", " # Use str.translate to replace characters based on the translation table\n", " translated = substituted.translate(translation_table)\n", "\n", " return translated" ], "metadata": { "id": "TKx8oA1n7rKh" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XjAPkfq7SF87" }, "source": [ "# Get Evaluation Data" ] }, { "cell_type": "code", "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv" ], "metadata": { "id": "qwCG0jX-88nQ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "bda9ccb4-f4d8-432b-f460-bfcbea7e462b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-05-10 11:19:00-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n", "Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.55, 18.164.174.118, ...\n", "Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 56026 (55K) [text/plain]\n", "Saving to: ‘SentenceBench.csv’\n", "\n", "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.008s \n", "\n", "2025-05-10 11:19:00 (6.90 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "sentence_bench = pd.read_csv('SentenceBench.csv')" ], "metadata": { "id": "hJO-UAPDQvcb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "sentence_bench.head(3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "qlYbrnUa9LAN", "outputId": "2fa1904b-72eb-4df9-9d92-f3918ce8ccf3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " dataset grapheme \\\n", "0 homograph من قدر تو را میدانم \n", "1 homograph از قضای الهی به قدر الهی پناه میبرم \n", "2 homograph به دست و صورتم کرم زدم \n", "\n", " phoneme homograph word \\\n", "0 man qadr-e to rA mi-dAnam قدر \n", "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n", "2 be dast-o suratam kerem zadam کرم \n", "\n", " pronunciation \n", "0 qadr \n", "1 qadar \n", "2 kerem " ], "text/html": [ "\n", "
\n", " | dataset | \n", "grapheme | \n", "phoneme | \n", "homograph word | \n", "pronunciation | \n", "
---|---|---|---|---|---|
0 | \n", "homograph | \n", "من قدر تو را میدانم | \n", "man qadr-e to rA mi-dAnam | \n", "قدر | \n", "qadr | \n", "
1 | \n", "homograph | \n", "از قضای الهی به قدر الهی پناه میبرم | \n", "?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram | \n", "قدر | \n", "qadar | \n", "
2 | \n", "homograph | \n", "به دست و صورتم کرم زدم | \n", "be dast-o suratam kerem zadam | \n", "کرم | \n", "kerem | \n", "