mahta.fetrat
/
Persian-G2P-Tools-Benchmark


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476
							{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WEY5MiKLzurH"
      },
      "source": [
        "# Setup Environment"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "v0YxPpE7XSdB",
        "outputId": "5586320a-5326-406f-af68-2baaa1cad8f1"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting hazm==0.10.0\n",
            "  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
            "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n",
            "  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
            "Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n",
            "  Downloading flashtext-2.7.tar.gz (14 kB)\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n",
            "  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
            "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
            "Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
            "  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
            "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n",
            "  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
            "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
            "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n",
            "  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
            "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
            "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n",
            "  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.5.0)\n",
            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
            "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
            "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
            "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m39.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hBuilding wheels for collected packages: flashtext\n",
            "  Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=7fbaeca40988cc63186878778821a11ea0a3077720f0c5f64c14cb14f24caaa9\n",
            "  Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
            "Successfully built flashtext\n",
            "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n",
            "  Attempting uninstall: numpy\n",
            "    Found existing installation: numpy 2.0.2\n",
            "    Uninstalling numpy-2.0.2:\n",
            "      Successfully uninstalled numpy-2.0.2\n",
            "  Attempting uninstall: scipy\n",
            "    Found existing installation: scipy 1.15.2\n",
            "    Uninstalling scipy-1.15.2:\n",
            "      Successfully uninstalled scipy-1.15.2\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
            "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
            "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
            "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
            "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
            "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
            "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
            "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
            "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
            "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.colab-display-data+json": {
              "pip_warning": {
                "packages": [
                  "numpy"
                ]
              },
              "id": "6a279841cbfe46c6a5d57887cb3ce1b8"
            }
          },
          "metadata": {}
        }
      ],
      "source": [
        "! pip install hazm==0.10.0"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cq_LdJhLTj-G",
        "outputId": "6ca473ba-b470-4156-a89e-91012e10132f"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting numpy==1.26.0\n",
            "  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n",
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m92.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: numpy\n",
            "  Attempting uninstall: numpy\n",
            "    Found existing installation: numpy 1.24.3\n",
            "    Uninstalling numpy-1.24.3:\n",
            "      Successfully uninstalled numpy-1.24.3\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n",
            "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\n",
            "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed numpy-1.26.0\n",
            "Collecting pandas==2.1.4\n",
            "  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
            "Requirement already satisfied: numpy<2,>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (1.26.0)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2.9.0.post0)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n",
            "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas==2.1.4) (1.17.0)\n",
            "Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m103.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: pandas\n",
            "  Attempting uninstall: pandas\n",
            "    Found existing installation: pandas 2.2.2\n",
            "    Uninstalling pandas-2.2.2:\n",
            "      Successfully uninstalled pandas-2.2.2\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.1.4 which is incompatible.\n",
            "mizani 0.13.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\n",
            "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
            "plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed pandas-2.1.4\n"
          ]
        }
      ],
      "source": [
        "!pip install numpy==1.26.0\n",
        "!pip install pandas==2.1.4"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-ALZJIsacLHw",
        "outputId": "ff064e07-5e0a-4666-ab40-09612db588be"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting jiwer\n",
            "  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
            "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
            "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
            "  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
            "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
            "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
            "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
          ]
        }
      ],
      "source": [
        "! pip install jiwer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "I7f1WhU8cbBh"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import re\n",
        "from tqdm import tqdm\n",
        "import csv\n",
        "import pandas as pd\n",
        "import json\n",
        "import itertools\n",
        "from jiwer import cer"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UloQzMxIcZmv"
      },
      "source": [
        "# Setup Model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jviCS0zCmtJc",
        "outputId": "abffb8b9-db25-426b-f1f7-eab08e4abbb9"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'G2P'...\n",
            "remote: Enumerating objects: 130, done.\u001b[K\n",
            "remote: Counting objects: 100% (9/9), done.\u001b[K\n",
            "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
            "remote: Total 130 (delta 2), reused 0 (delta 0), pack-reused 121 (from 1)\u001b[K\n",
            "Receiving objects: 100% (130/130), 7.90 MiB | 9.47 MiB/s, done.\n",
            "Resolving deltas: 100% (46/46), done.\n"
          ]
        }
      ],
      "source": [
        "! git clone https://github.com/mohamad-hasan-sohan-ajini/G2P.git"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "URJJtd4vns2T"
      },
      "outputs": [],
      "source": [
        "! mv G2P/* ./"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kr-UWEf9sl_G"
      },
      "outputs": [],
      "source": [
        "config = '''import os\n",
        "import json\n",
        "\n",
        "import torch\n",
        "\n",
        "cpu = torch.device('cpu')\n",
        "gpu = torch.device('cuda')\n",
        "\n",
        "\n",
        "class DataConfig(object):\n",
        "    language = 'FA'\n",
        "    graphemes_path = f'resources/{language}/Graphemes.json'\n",
        "    phonemes_path = f'resources/{language}/Phonemes.json'\n",
        "    lexicon_path = f'resources/{language}/Lexicon.json'\n",
        "\n",
        "\n",
        "class ModelConfig(object):\n",
        "    with open(DataConfig.graphemes_path) as f:\n",
        "        graphemes_size = len(json.load(f))\n",
        "\n",
        "    with open(DataConfig.phonemes_path) as f:\n",
        "        phonemes_size = len(json.load(f))\n",
        "\n",
        "    hidden_size = 128\n",
        "\n",
        "\n",
        "class TrainConfig(object):\n",
        "    device = gpu if torch.cuda.is_available() else cpu\n",
        "    lr = 3e-4\n",
        "    batch_size = 128\n",
        "    epochs = int(os.getenv('EPOCHS', '10'))\n",
        "    log_path = f'log/{DataConfig.language}'\n",
        "\n",
        "\n",
        "class TestConfig(object):\n",
        "    device = cpu\n",
        "    encoder_model_path = f'models/{DataConfig.language}/encoder_e{TrainConfig.epochs:02}.pth'\n",
        "    decoder_model_path = f'models/{DataConfig.language}/decoder_e{TrainConfig.epochs:02}.pth'\n",
        "'''\n",
        "\n",
        "with open('/content/config.py', 'w') as f:\n",
        "  f.write(config)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vBgfRUS3rjtR",
        "outputId": "84cf30d3-8cc2-4078-9a08-dd0adc089e28"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "2025-05-13 04:29:52.161062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
            "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
            "E0000 00:00:1747110592.186386     842 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
            "E0000 00:00:1747110592.193885     842 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
            "2025-05-13 04:29:52.219109: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
            "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
            "--------------------epoch: 01--------------------\n",
            "100% 419/419 [01:39<00:00,  4.20it/s]\n",
            "--------------------epoch: 02--------------------\n",
            "100% 419/419 [01:38<00:00,  4.24it/s]\n",
            "--------------------epoch: 03--------------------\n",
            "100% 419/419 [01:38<00:00,  4.24it/s]\n",
            "--------------------epoch: 04--------------------\n",
            "100% 419/419 [01:42<00:00,  4.11it/s]\n",
            "--------------------epoch: 05--------------------\n",
            "100% 419/419 [01:39<00:00,  4.20it/s]\n",
            "--------------------epoch: 06--------------------\n",
            "100% 419/419 [01:43<00:00,  4.06it/s]\n",
            "--------------------epoch: 07--------------------\n",
            "100% 419/419 [01:41<00:00,  4.11it/s]\n",
            "--------------------epoch: 08--------------------\n",
            "100% 419/419 [01:41<00:00,  4.14it/s]\n",
            "--------------------epoch: 09--------------------\n",
            "100% 419/419 [01:43<00:00,  4.06it/s]\n",
            "--------------------epoch: 10--------------------\n",
            "100% 419/419 [01:43<00:00,  4.04it/s]\n"
          ]
        }
      ],
      "source": [
        "! LANGUAGE=FA python train.py"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VtxEYym69RUH"
      },
      "source": [
        "# mapping"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TKx8oA1n7rKh"
      },
      "outputs": [],
      "source": [
        "output_to_phonetics_map = {\n",
        "    'м': 'm',\n",
        "    'ʷ':' v',\n",
        "    'w': 'v',\n",
        "    'c': 'k',\n",
        "    'ĉ': 'C',\n",
        "    'č': 'C',\n",
        "    '̕': \"?\",\n",
        "    \"'\": '?',\n",
        "    'ʔ': \"?\",\n",
        "    'ꞌ': \"?\",\n",
        "    '̛':  \"?\",\n",
        "    '’': \"?\",\n",
        "    'ʼ': \"?\",\n",
        "    \"'\": '?',\n",
        "    'â': 'A',\n",
        "    'â': 'A',\n",
        "    'ȃ': 'A',\n",
        "    'ž': 'Z',\n",
        "    'š': 'S',\n",
        "    'W': 'v',\n",
        "    'β': 'f',\n",
        "    'е': 'e',\n",
        "    '`': \"?\",\n",
        "    'ɑ': 'A',\n",
        "    'ɑ': 'A',\n",
        "    'ʃ': 'S',\n",
        "    'ð': 'z',\n",
        "    'ɾ': 'r',\n",
        "    'æ': 'a',\n",
        "    'ɪ': 'e',\n",
        "    'χ': 'x',\n",
        "    'ɣ': 'q',\n",
        "    'ʒ': 'Z',\n",
        "    ':': '',\n",
        "    'ː': '',\n",
        "    'ā': 'A',\n",
        "    'ː': '',\n",
        "    'ä': 'A',\n",
        "    'á': 'A',\n",
        "    'š': 'S',\n",
        "    'ū': 'u',\n",
        "    'û': 'u',\n",
        "    'ś': 's',\n",
        "    'ī': 'i',\n",
        "    'í': 'i',\n",
        "    'î': 'i',\n",
        "    'é': 'e',\n",
        "    'ḥ': 'h',\n",
        "    'ɒ': 'A',\n",
        "    'ʰ': '',\n",
        "    'ə': 'e',\n",
        "    'R': 'r',\n",
        "    'W': 'v',\n",
        "    'Q': 'q',\n",
        "    'T': 't',\n",
        "    'Y': 'y',\n",
        "    'P': 'p',\n",
        "    'D': 'd',\n",
        "    'F': 'f',\n",
        "    'H': 'h',\n",
        "    'J': 'j',\n",
        "    'L': 'l',\n",
        "    'X': 'x',\n",
        "    'V': 'v',\n",
        "    'B': 'b',\n",
        "    'N': 'n',\n",
        "    'M': 'm',\n",
        "    'K': 'k',\n",
        "    'G': 'g',\n",
        "    'U': 'u',\n",
        "    'O': 'o',\n",
        "    'I': 'i',\n",
        "    'E': 'e',\n",
        "    'ŋ': 'ng',\n",
        "    '.': '',\n",
        "    'ɛ': 'e',\n",
        "    'ʊ': 'u',\n",
        "    \"ˈ\": '?',\n",
        "    'ù': 'u',\n",
        "    'θ': 's',\n",
        "    '̪': '',\n",
        "    'ũ': 'u',\n",
        "    '_': '',\n",
        "    'ç': 'C',\n",
        "    'ĝ': 'q',\n",
        "    'ɢ': 'q',\n",
        "    'ː': '',\n",
        "    'í': 'i',\n",
        "    'ŝ': 'S',\n",
        "    '!': '',\n",
        "    'ǧ': 'q',\n",
        "    'ʻ': '?',\n",
        "    'è': 'e',\n",
        "    '�': '',\n",
        "    'ú': 'u',\n",
        "    'ô': 'o',\n",
        "    'ē': 'e',\n",
        "    'à': 'A',\n",
        "    'ă': 'A',\n",
        "    'ǐ': 'i',\n",
        "    'ü': 'u',\n",
        "    '\\u200e': '',\n",
        "    'ğ': 'q',\n",
        "    'ṣ': 'S',\n",
        "    'â': 'A',\n",
        "    'â': 'A',\n",
        "    'ȃ': 'A',\n",
        "    'ž': 'Z',\n",
        "    'š': 'S',\n",
        "    'ā': 'A',\n",
        "    'ː': '',\n",
        "    'ä': 'A',\n",
        "    'á': 'A',\n",
        "    'š': 'S',\n",
        "    'ū': 'u',\n",
        "    'û': 'u',\n",
        "    'ś': 'S',\n",
        "    'ī': 'i',\n",
        "    'í': 'i',\n",
        "    'î': 'i',\n",
        "    'é': 'e',\n",
        "}\n",
        "\n",
        "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n",
        "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n",
        "\n",
        "\n",
        "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n",
        "    substituted = re.sub(r'tʃʰ', 'C', input_string)\n",
        "    substituted = re.sub(r't͡ʃ', 'C', input_string)\n",
        "    substituted = re.sub(r'tʃ', 'C', substituted)\n",
        "    substituted = re.sub(r't͡S', 'C', substituted)\n",
        "    substituted = re.sub(r'ow', 'o', substituted)\n",
        "    substituted = re.sub('d͡ʒ', 'j', substituted)\n",
        "    substituted = re.sub('dʒ', 'j', substituted)\n",
        "\n",
        "    # Create a translation table using str.maketrans\n",
        "    translation_table = str.maketrans(char_map)\n",
        "\n",
        "    # Use str.translate to replace characters based on the translation table\n",
        "    translated = substituted.translate(translation_table)\n",
        "\n",
        "    return translated"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XjAPkfq7SF87"
      },
      "source": [
        "# Get Evaluation Data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qwCG0jX-88nQ",
        "outputId": "bd5bce45-3a10-476b-a253-a5abf6ec058e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2025-05-13 04:46:59--  https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
            "Resolving huggingface.co (huggingface.co)... 3.166.152.65, 3.166.152.44, 3.166.152.110, ...\n",
            "Connecting to huggingface.co (huggingface.co)|3.166.152.65|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 56026 (55K) [text/plain]\n",
            "Saving to: ‘SentenceBench.csv’\n",
            "\n",
            "\rSentenceBench.csv     0%[                    ]       0  --.-KB/s               \rSentenceBench.csv   100%[===================>]  54.71K  --.-KB/s    in 0.01s   \n",
            "\n",
            "2025-05-13 04:46:59 (4.08 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hJO-UAPDQvcb"
      },
      "outputs": [],
      "source": [
        "sentence_bench = pd.read_csv('SentenceBench.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 178
        },
        "id": "qlYbrnUa9LAN",
        "outputId": "2480b1b2-529a-45b0-9cb2-5378abb61256"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "     dataset                              grapheme  \\\n",
              "0  homograph                  من قدر تو را می‌دانم   \n",
              "1  homograph  از قضای الهی به قدر الهی پناه می‌برم   \n",
              "2  homograph                به دست و صورتم کرم زدم   \n",
              "\n",
              "                                             phoneme homograph word  \\\n",
              "0                          man qadr-e to rA mi-dAnam            قدر   \n",
              "1  ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram            قدر   \n",
              "2                      be dast-o suratam kerem zadam            کرم   \n",
              "\n",
              "  pronunciation  \n",
              "0          qadr  \n",
              "1         qadar  \n",
              "2         kerem  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-379c9afc-5742-4ce3-adb7-b20dc765b21c\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>dataset</th>\n",
              "      <th>grapheme</th>\n",
              "      <th>phoneme</th>\n",
              "      <th>homograph word</th>\n",
              "      <th>pronunciation</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>homograph</td>\n",
              "      <td>من قدر تو را می‌دانم</td>\n",
              "      <td>man qadr-e to rA mi-dAnam</td>\n",
              "      <td>قدر</td>\n",
              "      <td>qadr</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>homograph</td>\n",
              "      <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
              "      <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
              "      <td>قدر</td>\n",
              "      <td>qadar</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>homograph</td>\n",
              "      <td>به دست و صورتم کرم زدم</td>\n",
              "      <td>be dast-o suratam kerem zadam</td>\n",
              "      <td>کرم</td>\n",
              "      <td>kerem</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-379c9afc-5742-4ce3-adb7-b20dc765b21c')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-379c9afc-5742-4ce3-adb7-b20dc765b21c button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-379c9afc-5742-4ce3-adb7-b20dc765b21c');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    <div id=\"df-4043d60b-3378-49a9-893c-76777c4218a3\">\n",
              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-4043d60b-3378-49a9-893c-76777c4218a3')\"\n",
              "                title=\"Suggest charts\"\n",
              "                style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "      </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "      <script>\n",
              "        async function quickchart(key) {\n",
              "          const quickchartButtonEl =\n",
              "            document.querySelector('#' + key + ' button');\n",
              "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "          try {\n",
              "            const charts = await google.colab.kernel.invokeFunction(\n",
              "                'suggestCharts', [key], {});\n",
              "          } catch (error) {\n",
              "            console.error('Error during call to suggestCharts:', error);\n",
              "          }\n",
              "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "        }\n",
              "        (() => {\n",
              "          let quickchartButtonEl =\n",
              "            document.querySelector('#df-4043d60b-3378-49a9-893c-76777c4218a3 button');\n",
              "          quickchartButtonEl.style.display =\n",
              "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "        })();\n",
              "      </script>\n",
              "    </div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "sentence_bench",
              "summary": "{\n  \"name\": \"sentence_bench\",\n  \"rows\": 400,\n  \"fields\": [\n    {\n      \"column\": \"dataset\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"homograph\",\n          \"mana-tts\",\n          \"commonvoice\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"grapheme\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 400,\n        \"samples\": [\n          \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n          \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. \\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n          \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"phoneme\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 400,\n        \"samples\": [\n          \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n          \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n          \"dud-o meh-e qalizi dar mohit piCide bud\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"homograph word\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 101,\n        \"samples\": [\n          \"\\u06af\\u0631\\u06cc\\u0645\",\n          \"\\u0633\\u0628\\u06a9\\u06cc\",\n          \"\\u06a9\\u0645\\u06cc\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"pronunciation\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 210,\n        \"samples\": [\n          \"darham\",\n          \"Sum\",\n          \"moSk\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 11
        }
      ],
      "source": [
        "sentence_bench.head(3)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wDV7ysXf2b_H"
      },
      "source": [
        "### Get ManaTTS"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TcL5ZLvSSnVB",
        "outputId": "08ac8a87-ed45-4c9c-9136-02e7604062e5"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
              "  'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
              " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
              "  'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
              " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
              "  'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
        "\n",
        "# Convert to a list of tuples\n",
        "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
        "\n",
        "mana_evaluation_data[:3]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Jjacw9Mp2eoX"
      },
      "source": [
        "### Get CommonVoice"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-yQnqCGw26sk",
        "outputId": "1bce4c39-87df-4b6a-e7b4-f53b8aca3deb"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
              "  'dar ?aksar-e Sahr-hA, markazi barAye xarid-e  doCarxe vojud dArad.'),\n",
              " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
              "  'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
              " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
            ]
          },
          "metadata": {},
          "execution_count": 13
        }
      ],
      "source": [
        "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
        "\n",
        "# Convert to a list of tuples\n",
        "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
        "\n",
        "commonvoice_evaluation_data[:3]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ciSPyhRc3Rvo"
      },
      "source": [
        "### Get Homograph"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XlFc5JbN3Rvz",
        "outputId": "2cf275a5-fc3b-4976-ab7b-4471cc71a211"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
              " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
              "  '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
              "  'قدر',\n",
              "  'qadar'),\n",
              " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ],
      "source": [
        "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
        "\n",
        "# Convert to a list of tuples\n",
        "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
        "\n",
        "homograph_evaluation_data[:3]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "R6PE5ds45TPr"
      },
      "source": [
        "# Evaluate Method Outputs"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CLKaERek4u_D"
      },
      "source": [
        "## PER Evaluation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nBee9xG54u_E"
      },
      "outputs": [],
      "source": [
        "def remove_non_word_chars(text):\n",
        "    pattern = r'[^\\w\\s\\?]'\n",
        "    cleaned_text = re.sub(pattern, ' ', text)\n",
        "    return cleaned_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "W8PoNV9V4u_E"
      },
      "outputs": [],
      "source": [
        "def remove_white_spaces(text):\n",
        "    cleaned_text = re.sub(r'\\s+', ' ', text)\n",
        "    return cleaned_text.strip()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YD0cvnn74u_E"
      },
      "outputs": [],
      "source": [
        "def get_word_only_text(text):\n",
        "  word_only_text = remove_non_word_chars(text)\n",
        "  extra_space_removed_text = remove_white_spaces(word_only_text)\n",
        "\n",
        "  return extra_space_removed_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6OQQDual4u_E"
      },
      "outputs": [],
      "source": [
        "def get_texts_cer(reference, model_output):\n",
        "  # Preprocess input texts to only contain word characters\n",
        "  word_only_reference = get_word_only_text(reference)\n",
        "  word_only_output = get_word_only_text(model_output)\n",
        "\n",
        "  # Return +infinity for CER if any of the texts is empty\n",
        "  if not word_only_reference.strip() or not word_only_output.strip():\n",
        "    return float('inf')\n",
        "\n",
        "  return cer(word_only_reference, word_only_output)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ncWQnPdW4u_E"
      },
      "outputs": [],
      "source": [
        "def get_avg_cer_of_method(method_outputs, references):\n",
        "  cers = []\n",
        "  for idx, o in enumerate(method_outputs):\n",
        "    cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
        "    if cer != float('inf'):\n",
        "      cers.append(cer)\n",
        "\n",
        "  return sum(cers) / len(cers)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oBgNtpFQDwku"
      },
      "source": [
        "## Homograph Evaluation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "J445ULEvEEDn"
      },
      "outputs": [],
      "source": [
        "def get_homograph_performance(outputs, references):\n",
        "  corrects = 0\n",
        "  total = 0\n",
        "\n",
        "  for idx, (g, p, homograph, right) in enumerate(references):\n",
        "    if homograph != '':\n",
        "      total += 1\n",
        "      if right in outputs[idx]:\n",
        "        corrects += 1\n",
        "\n",
        "  return corrects / total"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JGEUIrbi9kNH"
      },
      "source": [
        "# Full bench"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fGzQvL8V9mln"
      },
      "outputs": [],
      "source": [
        "benchmark = []\n",
        "\n",
        "for g, p in mana_evaluation_data:\n",
        "  benchmark.append((g, p, '', ''))\n",
        "\n",
        "for g, p in commonvoice_evaluation_data:\n",
        "  benchmark.append((g, p, '', ''))\n",
        "\n",
        "for g, p, w, r in homograph_evaluation_data:\n",
        "  benchmark.append((g, p, w, r))\n",
        "\n",
        "benchmark = benchmark[:400]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DpSqE5oPbmAy"
      },
      "outputs": [],
      "source": [
        "def print_all_metrics(predictions):\n",
        "  per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
        "  homograph = get_homograph_performance(predictions, benchmark) * 100\n",
        "\n",
        "  print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
        "  print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JzSaBXW6XJng"
      },
      "source": [
        "# Sentece"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DhIW_vDBXrSA"
      },
      "outputs": [],
      "source": [
        "import subprocess\n",
        "\n",
        "def run_script_with_args(sent):\n",
        "    try:\n",
        "        command = [\"python\", \"test.py\", \"--word\", sent]\n",
        "        result = subprocess.run(command, capture_output=True, text=True)\n",
        "\n",
        "        if result.returncode == 0:\n",
        "          return result.stdout.replace(\"<eos>\\n\", \"\")\n",
        "        else:\n",
        "            print(f\"An error occurred: {result.stderr}\")\n",
        "            return ''\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"An unexpected error occurred: {str(e)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PNuCMdIuVPf5"
      },
      "outputs": [],
      "source": [
        "from hazm import WordTokenizer, Normalizer\n",
        "\n",
        "tokenizer = WordTokenizer()\n",
        "normalizer = Normalizer()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Q-brVhjthjZO"
      },
      "outputs": [],
      "source": [
        "def remove_non_word_chars(text):\n",
        "    pattern = r'[^\\w\\s]'\n",
        "    cleaned_text = re.sub(pattern, ' ', text)\n",
        "    return cleaned_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rTPVMXEGU_9r"
      },
      "outputs": [],
      "source": [
        "def sentence_inference(sent):\n",
        "  phonemes = []\n",
        "  tokens = tokenizer.tokenize(normalizer.normalize(sent))\n",
        "  for token in tokens:\n",
        "    token = remove_non_word_chars(token).replace('_', '').replace(\" \",\"\").replace('ۀ', 'ه‌ی')\n",
        "    phoneme = run_script_with_args(token).replace(\" \",\"\")\n",
        "    phonemes.append(phoneme)\n",
        "\n",
        "  phoneme = ' '.join(phonemes)\n",
        "  return phoneme"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fRaAhTMsMHBJ"
      },
      "source": [
        "# outputs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PYn9z4GiMHBf",
        "outputId": "aa2b1f9a-28ff-462b-e7ab-6c0e576d02e6"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            " 97%|█████████▋| 388/400 [3:01:12<04:12, 21.06s/it]"
          ]
        }
      ],
      "source": [
        "from tqdm import tqdm\n",
        "import time\n",
        "\n",
        "outputs = []\n",
        "start_time = time.time()  # Start timer\n",
        "\n",
        "for g, p, _, _ in tqdm(benchmark):\n",
        "    g = g.replace('ء', '')\n",
        "    g = g.replace('ؤ', 'و')\n",
        "\n",
        "    o = sentence_inference(g)\n",
        "    outputs.append(o)\n",
        "\n",
        "# Timing results\n",
        "total_time = time.time() - start_time\n",
        "avg_time = total_time / len(benchmark) if benchmark else 0"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "len(outputs)"
      ],
      "metadata": {
        "id": "JOT3ses7o1Q-",
        "outputId": "d2b800bc-e7b8-4094-8d8d-708c62d6ae5b",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "400"
            ]
          },
          "metadata": {},
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "plLWMbWboXdo"
      },
      "outputs": [],
      "source": [
        "mapped_outputs = []\n",
        "for o in outputs:\n",
        "  o = o.replace('.', '')\n",
        "  mapped = replace_phonetic_characters(o)\n",
        "  mapped_outputs.append(mapped)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zP4Tcj285Ij0",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "d38b4562-b949-4c4f-aded-079ca69f2e68"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "PER: \t\t\t16.8950\n",
            "HOMOGRAPH: \t\t31.1321\n",
            "TOTAL TIME:\t\t11167.3797 (s)\n",
            "AVG TIME:\t\t27.9184 (s)+\n"
          ]
        }
      ],
      "source": [
        "print_all_metrics(mapped_outputs)\n",
        "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
        "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)+\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dgcrW_NEnzBC"
      },
      "source": [
        "# Runs\n",
        "\n",
        "## First:\n",
        "\n",
        "```\n",
        "PER: \t\t\t21.1506\n",
        "HOMOGRAPH: \t\t29.7170\n",
        "TOTAL TIME:\t\t10962.4703 (s)\n",
        "AVG TIME:\t\t27.4062 (s)+\n",
        "```\n",
        "\n",
        "## Second\n",
        "\n",
        "```\n",
        "PER: \t\t\t19.1876\n",
        "HOMOGRAPH: \t\t29.7170\n",
        "TOTAL TIME:\t\t11208.3624 (s)\n",
        "AVG TIME:\t\t28.0209 (s)+\n",
        "```\n",
        "\n",
        "## Third\n",
        "\n",
        "```\n",
        "PER: \t\t\t21.4541\n",
        "HOMOGRAPH: \t\t29.2453\n",
        "TOTAL TIME:\t\t11237.2792 (s)\n",
        "AVG TIME:\t\t28.0932 (s)+\n",
        "```\n",
        "\n",
        "## Fourth\n",
        "\n",
        "```\n",
        "PER: \t\t\t19.4678\n",
        "HOMOGRAPH: \t\t29.7170\n",
        "TOTAL TIME:\t\t11432.3679 (s)\n",
        "AVG TIME:\t\t28.5809 (s)+\n",
        "```\n",
        "\n",
        "## Fifth\n",
        "\n",
        "```\n",
        "PER: \t\t\t16.8950\n",
        "HOMOGRAPH: \t\t31.1321\n",
        "TOTAL TIME:\t\t11167.3797 (s)\n",
        "AVG TIME:\t\t27.9184 (s)+\n",
        "```"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}