{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "WEY5MiKLzurH" }, "source": [ "# Setup Environment" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "v0YxPpE7XSdB", "outputId": "5586320a-5326-406f-af68-2baaa1cad8f1" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting hazm==0.10.0\n", " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n", "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n", " Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n", "Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n", " Downloading flashtext-2.7.tar.gz (14 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n", " Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n", "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n", "Collecting numpy==1.24.3 (from hazm==0.10.0)\n", " Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n", " Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n", "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n", " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n", "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n", " Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.5.0)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n", "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n", "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m39.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: flashtext\n", " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=7fbaeca40988cc63186878778821a11ea0a3077720f0c5f64c14cb14f24caaa9\n", " Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n", "Successfully built flashtext\n", "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 2.0.2\n", " Uninstalling numpy-2.0.2:\n", " Successfully uninstalled numpy-2.0.2\n", " Attempting uninstall: scipy\n", " Found existing installation: scipy 1.15.2\n", " Uninstalling scipy-1.15.2:\n", " Successfully uninstalled scipy-1.15.2\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n", "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n", "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n", "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n", "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n", "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n", "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n", "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n" ] }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "numpy" ] }, "id": "6a279841cbfe46c6a5d57887cb3ce1b8" } }, "metadata": {} } ], "source": [ "! pip install hazm==0.10.0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cq_LdJhLTj-G", "outputId": "6ca473ba-b470-4156-a89e-91012e10132f" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting numpy==1.26.0\n", " Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m92.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: numpy\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 1.24.3\n", " Uninstalling numpy-1.24.3:\n", " Successfully uninstalled numpy-1.24.3\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n", "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\n", "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed numpy-1.26.0\n", "Collecting pandas==2.1.4\n", " Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", "Requirement already satisfied: numpy<2,>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (1.26.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas==2.1.4) (1.17.0)\n", "Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m103.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: pandas\n", " Attempting uninstall: pandas\n", " Found existing installation: pandas 2.2.2\n", " Uninstalling pandas-2.2.2:\n", " Successfully uninstalled pandas-2.2.2\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.1.4 which is incompatible.\n", "mizani 0.13.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\n", "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n", "plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed pandas-2.1.4\n" ] } ], "source": [ "!pip install numpy==1.26.0\n", "!pip install pandas==2.1.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-ALZJIsacLHw", "outputId": "ff064e07-5e0a-4666-ab40-09612db588be" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jiwer\n", " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n", "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n", "Collecting rapidfuzz>=3.9.7 (from jiwer)\n", " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n", "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n", "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n" ] } ], "source": [ "! pip install jiwer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "I7f1WhU8cbBh" }, "outputs": [], "source": [ "import os\n", "import re\n", "from tqdm import tqdm\n", "import csv\n", "import pandas as pd\n", "import json\n", "import itertools\n", "from jiwer import cer" ] }, { "cell_type": "markdown", "metadata": { "id": "UloQzMxIcZmv" }, "source": [ "# Setup Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jviCS0zCmtJc", "outputId": "abffb8b9-db25-426b-f1f7-eab08e4abbb9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'G2P'...\n", "remote: Enumerating objects: 130, done.\u001b[K\n", "remote: Counting objects: 100% (9/9), done.\u001b[K\n", "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", "remote: Total 130 (delta 2), reused 0 (delta 0), pack-reused 121 (from 1)\u001b[K\n", "Receiving objects: 100% (130/130), 7.90 MiB | 9.47 MiB/s, done.\n", "Resolving deltas: 100% (46/46), done.\n" ] } ], "source": [ "! git clone https://github.com/mohamad-hasan-sohan-ajini/G2P.git" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "URJJtd4vns2T" }, "outputs": [], "source": [ "! mv G2P/* ./" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kr-UWEf9sl_G" }, "outputs": [], "source": [ "config = '''import os\n", "import json\n", "\n", "import torch\n", "\n", "cpu = torch.device('cpu')\n", "gpu = torch.device('cuda')\n", "\n", "\n", "class DataConfig(object):\n", " language = 'FA'\n", " graphemes_path = f'resources/{language}/Graphemes.json'\n", " phonemes_path = f'resources/{language}/Phonemes.json'\n", " lexicon_path = f'resources/{language}/Lexicon.json'\n", "\n", "\n", "class ModelConfig(object):\n", " with open(DataConfig.graphemes_path) as f:\n", " graphemes_size = len(json.load(f))\n", "\n", " with open(DataConfig.phonemes_path) as f:\n", " phonemes_size = len(json.load(f))\n", "\n", " hidden_size = 128\n", "\n", "\n", "class TrainConfig(object):\n", " device = gpu if torch.cuda.is_available() else cpu\n", " lr = 3e-4\n", " batch_size = 128\n", " epochs = int(os.getenv('EPOCHS', '10'))\n", " log_path = f'log/{DataConfig.language}'\n", "\n", "\n", "class TestConfig(object):\n", " device = cpu\n", " encoder_model_path = f'models/{DataConfig.language}/encoder_e{TrainConfig.epochs:02}.pth'\n", " decoder_model_path = f'models/{DataConfig.language}/decoder_e{TrainConfig.epochs:02}.pth'\n", "'''\n", "\n", "with open('/content/config.py', 'w') as f:\n", " f.write(config)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vBgfRUS3rjtR", "outputId": "84cf30d3-8cc2-4078-9a08-dd0adc089e28" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "2025-05-13 04:29:52.161062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1747110592.186386 842 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1747110592.193885 842 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2025-05-13 04:29:52.219109: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "--------------------epoch: 01--------------------\n", "100% 419/419 [01:39<00:00, 4.20it/s]\n", "--------------------epoch: 02--------------------\n", "100% 419/419 [01:38<00:00, 4.24it/s]\n", "--------------------epoch: 03--------------------\n", "100% 419/419 [01:38<00:00, 4.24it/s]\n", "--------------------epoch: 04--------------------\n", "100% 419/419 [01:42<00:00, 4.11it/s]\n", "--------------------epoch: 05--------------------\n", "100% 419/419 [01:39<00:00, 4.20it/s]\n", "--------------------epoch: 06--------------------\n", "100% 419/419 [01:43<00:00, 4.06it/s]\n", "--------------------epoch: 07--------------------\n", "100% 419/419 [01:41<00:00, 4.11it/s]\n", "--------------------epoch: 08--------------------\n", "100% 419/419 [01:41<00:00, 4.14it/s]\n", "--------------------epoch: 09--------------------\n", "100% 419/419 [01:43<00:00, 4.06it/s]\n", "--------------------epoch: 10--------------------\n", "100% 419/419 [01:43<00:00, 4.04it/s]\n" ] } ], "source": [ "! LANGUAGE=FA python train.py" ] }, { "cell_type": "markdown", "metadata": { "id": "VtxEYym69RUH" }, "source": [ "# mapping" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TKx8oA1n7rKh" }, "outputs": [], "source": [ "output_to_phonetics_map = {\n", " 'м': 'm',\n", " 'ʷ':' v',\n", " 'w': 'v',\n", " 'c': 'k',\n", " 'ĉ': 'C',\n", " 'č': 'C',\n", " '̕': \"?\",\n", " \"'\": '?',\n", " 'ʔ': \"?\",\n", " 'ꞌ': \"?\",\n", " '̛': \"?\",\n", " '’': \"?\",\n", " 'ʼ': \"?\",\n", " \"'\": '?',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'W': 'v',\n", " 'β': 'f',\n", " 'е': 'e',\n", " '`': \"?\",\n", " 'ɑ': 'A',\n", " 'ɑ': 'A',\n", " 'ʃ': 'S',\n", " 'ð': 'z',\n", " 'ɾ': 'r',\n", " 'æ': 'a',\n", " 'ɪ': 'e',\n", " 'χ': 'x',\n", " 'ɣ': 'q',\n", " 'ʒ': 'Z',\n", " ':': '',\n", " 'ː': '',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 's',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", " 'ḥ': 'h',\n", " 'ɒ': 'A',\n", " 'ʰ': '',\n", " 'ə': 'e',\n", " 'R': 'r',\n", " 'W': 'v',\n", " 'Q': 'q',\n", " 'T': 't',\n", " 'Y': 'y',\n", " 'P': 'p',\n", " 'D': 'd',\n", " 'F': 'f',\n", " 'H': 'h',\n", " 'J': 'j',\n", " 'L': 'l',\n", " 'X': 'x',\n", " 'V': 'v',\n", " 'B': 'b',\n", " 'N': 'n',\n", " 'M': 'm',\n", " 'K': 'k',\n", " 'G': 'g',\n", " 'U': 'u',\n", " 'O': 'o',\n", " 'I': 'i',\n", " 'E': 'e',\n", " 'ŋ': 'ng',\n", " '.': '',\n", " 'ɛ': 'e',\n", " 'ʊ': 'u',\n", " \"ˈ\": '?',\n", " 'ù': 'u',\n", " 'θ': 's',\n", " '̪': '',\n", " 'ũ': 'u',\n", " '_': '',\n", " 'ç': 'C',\n", " 'ĝ': 'q',\n", " 'ɢ': 'q',\n", " 'ː': '',\n", " 'í': 'i',\n", " 'ŝ': 'S',\n", " '!': '',\n", " 'ǧ': 'q',\n", " 'ʻ': '?',\n", " 'è': 'e',\n", " '�': '',\n", " 'ú': 'u',\n", " 'ô': 'o',\n", " 'ē': 'e',\n", " 'à': 'A',\n", " 'ă': 'A',\n", " 'ǐ': 'i',\n", " 'ü': 'u',\n", " '\\u200e': '',\n", " 'ğ': 'q',\n", " 'ṣ': 'S',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 'S',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", "}\n", "\n", "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n", "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n", "\n", "\n", "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n", " substituted = re.sub(r'tʃʰ', 'C', input_string)\n", " substituted = re.sub(r't͡ʃ', 'C', input_string)\n", " substituted = re.sub(r'tʃ', 'C', substituted)\n", " substituted = re.sub(r't͡S', 'C', substituted)\n", " substituted = re.sub(r'ow', 'o', substituted)\n", " substituted = re.sub('d͡ʒ', 'j', substituted)\n", " substituted = re.sub('dʒ', 'j', substituted)\n", "\n", " # Create a translation table using str.maketrans\n", " translation_table = str.maketrans(char_map)\n", "\n", " # Use str.translate to replace characters based on the translation table\n", " translated = substituted.translate(translation_table)\n", "\n", " return translated" ] }, { "cell_type": "markdown", "metadata": { "id": "XjAPkfq7SF87" }, "source": [ "# Get Evaluation Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qwCG0jX-88nQ", "outputId": "bd5bce45-3a10-476b-a253-a5abf6ec058e" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-05-13 04:46:59-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n", "Resolving huggingface.co (huggingface.co)... 3.166.152.65, 3.166.152.44, 3.166.152.110, ...\n", "Connecting to huggingface.co (huggingface.co)|3.166.152.65|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 56026 (55K) [text/plain]\n", "Saving to: ‘SentenceBench.csv’\n", "\n", "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.01s \n", "\n", "2025-05-13 04:46:59 (4.08 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n", "\n" ] } ], "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hJO-UAPDQvcb" }, "outputs": [], "source": [ "sentence_bench = pd.read_csv('SentenceBench.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 178 }, "id": "qlYbrnUa9LAN", "outputId": "2480b1b2-529a-45b0-9cb2-5378abb61256" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " dataset grapheme \\\n", "0 homograph من قدر تو را میدانم \n", "1 homograph از قضای الهی به قدر الهی پناه میبرم \n", "2 homograph به دست و صورتم کرم زدم \n", "\n", " phoneme homograph word \\\n", "0 man qadr-e to rA mi-dAnam قدر \n", "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n", "2 be dast-o suratam kerem zadam کرم \n", "\n", " pronunciation \n", "0 qadr \n", "1 qadar \n", "2 kerem " ], "text/html": [ "\n", "
\n", " | dataset | \n", "grapheme | \n", "phoneme | \n", "homograph word | \n", "pronunciation | \n", "
---|---|---|---|---|---|
0 | \n", "homograph | \n", "من قدر تو را میدانم | \n", "man qadr-e to rA mi-dAnam | \n", "قدر | \n", "qadr | \n", "
1 | \n", "homograph | \n", "از قضای الهی به قدر الهی پناه میبرم | \n", "?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram | \n", "قدر | \n", "qadar | \n", "
2 | \n", "homograph | \n", "به دست و صورتم کرم زدم | \n", "be dast-o suratam kerem zadam | \n", "کرم | \n", "kerem | \n", "