{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "AdU8VMTIOWLZ", "a3zuvbqx2l68", "XjAPkfq7SF87", "R6PE5ds45TPr", "y73zFlRGIbt9", "oBgNtpFQDwku", "JGEUIrbi9kNH", "fTRgGM_8_Fwg", "jPXWBZ4R_bGs" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard" }, "cells": [ { "cell_type": "markdown", "source": [ "# Setup Environment" ], "metadata": { "id": "9sEfZoepGP8x" } }, { "cell_type": "code", "source": [ "! pip install hazm==0.10.0" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 770 }, "id": "u6n8Hc1hQSy7", "outputId": "e5448572-c76c-4336-97e0-4e931a1c3940" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: hazm==0.10.0 in /usr/local/lib/python3.11/dist-packages (0.10.0)\n", "Requirement already satisfied: fasttext-wheel<0.10.0,>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (0.9.2)\n", "Requirement already satisfied: flashtext<3.0,>=2.7 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (2.7)\n", "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (4.3.3)\n", "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n", "Collecting numpy==1.24.3 (from hazm==0.10.0)\n", " Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", "Requirement already satisfied: python-crfsuite<0.10.0,>=0.9.9 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (0.9.11)\n", "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n", "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.11/dist-packages (from 
fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (2.13.6)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n", "Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.13.1)\n", "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.4.2)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n", "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n", "Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", "Installing collected packages: numpy\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 1.26.0\n", " Uninstalling numpy-1.26.0:\n", " Successfully uninstalled numpy-1.26.0\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n", "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n", "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n", "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n", "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n", "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n", "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed numpy-1.24.3\n" ] }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "numpy" ] }, "id": "02f4ece44a3543ebb22ad3f3301874b3" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "!pip install numpy==1.26.0" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iA2Jjex-KMqx", "outputId": "521918bf-2909-4310-c2f8-5774c16a6215" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting numpy==1.26.0\n", " Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n", "Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n", "Installing collected packages: numpy\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 1.24.3\n", " Uninstalling numpy-1.24.3:\n", " Successfully uninstalled numpy-1.24.3\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into 
account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n", "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed numpy-1.26.0\n" ] } ] }, { "cell_type": "code", "source": [ "from IPython.display import display, HTML\n", "\n", "display(HTML(\"\"\"\n", "
\n", " Please restart the notebook! Click on RuntimeRestart session and then re-run all cells.\n", "
\n", "\"\"\"))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 54 }, "id": "QP854R4YHf4I", "outputId": "5bd9bf27-9215-48da-ee73-24cfe9162e69" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n", " Please restart the notebook! Click on RuntimeRestart session and then re-run all cells.\n", "
\n" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "!pip install -q --upgrade --no-cache-dir gdown" ], "metadata": { "id": "EVO9pn8Ou3o1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install -q unidecode\n", "!pip install -q transformers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "brKU69ZQvEiz", "outputId": "b4a6bc53-ddc0-4660-909a-5108aa4f7db0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/235.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.4/235.8 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "source": [ "!pip install jiwer" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "grp-l-cbGNWm", "outputId": "58c70285-e3ce-4c56-cb38-3cf7e07d26d1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jiwer\n", " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n", "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n", "Collecting rapidfuzz>=3.9.7 (from jiwer)\n", " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n", "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m 
\u001b[31m33.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n", "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import re\n", "from jiwer import cer" ], "metadata": { "id": "dQ0osefGGSpJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Setup Model" ], "metadata": { "id": "Nwt1YBYVqcva" } }, { "cell_type": "code", "source": [ "!gdown -q 1CrCX8SNhMcmi3KogffFaS4pSaC0t73nJ # The Checkpoint\n", "!unzip -q ./checkpoint-320.zip\n", "!rm ./checkpoint-320.zip" ], "metadata": { "id": "x-kHFEm8u8Xg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!gdown -q 11Yb0QjyP2R3RvN1oSCX9m0DL2_bzDeZS # Parsivar for normalization\n", "!unzip -q ./Parsivar.zip\n", "!rm ./Parsivar.zip" ], "metadata": { "id": "CGVVxGpivULm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "! 
gdown 1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34 # GE2PE.py" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "K-mPQF5ykcmF", "outputId": "3de14d12-0746-4114-e8ae-c9772e377f31" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading...\n", "From (original): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34\n", "From (redirected): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34&confirm=t&uuid=868f5ff1-a03c-4810-aa3c-acf1dda6dbaf\n", "To: /content/GE2PE.py\n", "\r 0% 0.00/4.96k [00:00] 54.71K --.-KB/s in 0.03s \n", "\n", "2025-05-10 15:31:20 (1.77 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "sentence_bench = pd.read_csv('SentenceBench.csv')" ], "metadata": { "id": "hJO-UAPDQvcb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "sentence_bench.head(3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qlYbrnUa9LAN", "outputId": "d31f0d21-7f88-48b1-daf8-aeac0b15efa5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " dataset grapheme \\\n", "0 homograph من قدر تو را می‌دانم \n", "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n", "2 homograph به دست و صورتم کرم زدم \n", "\n", " phoneme homograph word \\\n", "0 man qadr-e to rA mi-dAnam قدر \n", "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n", "2 be dast-o suratam kerem zadam کرم \n", "\n", " pronunciation \n", "0 qadr \n", "1 qadar \n", "2 kerem " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datasetgraphemephonemehomograph wordpronunciation
0homographمن قدر تو را می‌دانمman qadr-e to rA mi-dAnamقدرqadr
1homographاز قضای الهی به قدر الهی پناه می‌برم?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baramقدرqadar
2homographبه دست و صورتم کرم زدمbe dast-o suratam kerem zadamکرمkerem
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "sentence_bench", "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "wDV7ysXf2b_H" }, "source": [ "### Get ManaTTS" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TcL5ZLvSSnVB", "outputId": "d4bcec8f-566b-4574-d9a4-9d2f893fba98" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا 
# --- Get ManaTTS subset ---------------------------------------------------
# Pull the (grapheme, phoneme) reference pairs for the mana-tts portion of
# the benchmark and materialize them as plain tuples.
mana_rows = sentence_bench.loc[sentence_bench['dataset'] == 'mana-tts', ['grapheme', 'phoneme']]
mana_evaluation_data = [tuple(row) for row in mana_rows.itertuples(index=False, name=None)]

mana_evaluation_data[:3]

# --- Get CommonVoice subset -----------------------------------------------
# Same extraction for the commonvoice portion of the benchmark.
commonvoice_rows = sentence_bench.loc[sentence_bench['dataset'] == 'commonvoice', ['grapheme', 'phoneme']]
commonvoice_evaluation_data = [tuple(row) for row in commonvoice_rows.itertuples(index=False, name=None)]

commonvoice_evaluation_data[:3]
"base_uri": "https://localhost:8080/" }, "outputId": "7cf07ce9-3232-4a2c-c5fd-70ad316d79b0" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n", " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n", " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n", " 'قدر',\n", " 'qadar'),\n", " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]" ] }, "metadata": {}, "execution_count": 15 } ], "source": [ "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n", "\n", "# Convert to a list of tuples\n", "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n", "\n", "homograph_evaluation_data[:3]" ] }, { "cell_type": "markdown", "metadata": { "id": "R6PE5ds45TPr" }, "source": [ "# Evaluate Method Outputs" ] }, { "cell_type": "markdown", "metadata": { "id": "y73zFlRGIbt9" }, "source": [ "## PER Evaluation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ItuviO3w5Vzv" }, "outputs": [], "source": [ "def remove_non_word_chars(text):\n", " pattern = r'[^\\w\\s\\?]'\n", " cleaned_text = re.sub(pattern, ' ', text)\n", " return cleaned_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "syQCurXu51TO" }, "outputs": [], "source": [ "def remove_white_spaces(text):\n", " cleaned_text = re.sub(r'\\s+', ' ', text)\n", " return cleaned_text.strip()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V7APkVM053RP" }, "outputs": [], "source": [ "def get_word_only_text(text):\n", " word_only_text = remove_non_word_chars(text)\n", " extra_space_removed_text = remove_white_spaces(word_only_text)\n", "\n", " return extra_space_removed_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ROomKSao57vy" }, "outputs": [], "source": [ "def 
def get_texts_cer(reference, model_output):
    """Character Error Rate between a reference phoneme string and a model output.

    Both strings are first reduced to word characters only (see
    get_word_only_text).  Returns float('inf') when either side is empty
    after cleaning, so callers can skip degenerate pairs.
    """
    # Preprocess input texts to only contain word characters
    word_only_reference = get_word_only_text(reference)
    word_only_output = get_word_only_text(model_output)

    # Return +infinity for CER if any of the texts is empty
    if not word_only_reference.strip() or not word_only_output.strip():
        return float('inf')

    # jiwer.cer(reference, hypothesis): the first argument defines the denominator.
    return cer(word_only_reference, word_only_output)


def get_avg_cer_of_method(method_outputs, references):
    """Average CER of a method's outputs against the benchmark references.

    `references` is a list of (grapheme, phoneme, ...) tuples; index 1 holds
    the gold phoneme string.  Hyphens (used as separators in the phoneme
    notation) are stripped before scoring.  Returns 0.0 when no pair is
    scorable (avoids ZeroDivisionError on an empty/degenerate input).
    """
    cers = []
    for idx, output in enumerate(method_outputs):
        # BUGFIX: the gold phoneme must be the *reference* (first) argument of
        # get_texts_cer; the original call passed the model output first.
        # CER is asymmetric (the reference length is the denominator), so the
        # swapped order skewed the metric.  Also renamed the local from `cer`
        # to `sample_cer` so it no longer shadows jiwer's imported `cer`.
        sample_cer = get_texts_cer(references[idx][1].replace('-', ''),
                                   output.replace('-', ''))
        if sample_cer != float('inf'):
            cers.append(sample_cer)

    return sum(cers) / len(cers) if cers else 0.0


# --- Homograph evaluation -------------------------------------------------

def get_homograph_performance(outputs, references):
    """Fraction of homograph sentences whose correct pronunciation appears in the output.

    Only reference rows with a non-empty homograph word are counted; the
    check is a plain substring containment of the gold pronunciation in the
    predicted phoneme string.  Returns 0.0 when no homograph rows exist.
    """
    corrects = 0
    total = 0

    for idx, (g, p, homograph, right) in enumerate(references):
        if homograph != '':
            total += 1
            if right in outputs[idx]:
                corrects += 1

    # Guard against division by zero when the references contain no homographs.
    return corrects / total if total else 0.0


# --- Full bench -----------------------------------------------------------
# Concatenate the three subsets into one list of 4-tuples; the plain
# (grapheme, phoneme) subsets carry empty homograph fields so every row has
# the same shape.
benchmark = []

for g, p in mana_evaluation_data:
    benchmark.append((g, p, '', ''))

for g, p in commonvoice_evaluation_data:
    benchmark.append((g, p, '', ''))

for g, p, w, r in homograph_evaluation_data:
    benchmark.append((g, p, w, r))

# Cap the bench at 400 sentences (the published benchmark size).
benchmark = benchmark[:400]
"4jlXFt8tCPWB" }, "outputs": [], "source": [ "def print_all_metrics(predictions):\n", " per = get_avg_cer_of_method(predictions, benchmark) * 100\n", " # acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)\n", " homograph = get_homograph_performance(predictions, benchmark) * 100\n", "\n", " print(f\"PER: \\t\\t\\t{per:.4f}\")\n", " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")" ] }, { "cell_type": "markdown", "source": [ "# Inference" ], "metadata": { "id": "fTRgGM_8_Fwg" } }, { "cell_type": "code", "source": [ "graphemes = [item[0] for item in benchmark]" ], "metadata": { "id": "17lrgWh__Mzr" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import time\n", "\n", "start_time = time.time()\n", "\n", "outputs = g2p.generate(graphemes, use_rules=True)\n", "\n", "total_time = time.time() - start_time\n", "avg_time = total_time / len(graphemes) if len(graphemes) > 0 else 0" ], "metadata": { "id": "ajqTWtNb_HBd" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Mapping" ], "metadata": { "id": "jPXWBZ4R_bGs" } }, { "cell_type": "code", "source": [ "mapped_outputs = []\n", "\n", "# Define the replacements\n", "replacements = {\n", " 'a': 'A',\n", " '$': 'S',\n", " '/': 'a',\n", " '1': '',\n", " ';': 'Z',\n", " '@': '?',\n", " 'c': 'C'\n", "}\n", "\n", "# Apply replacements\n", "mapped_outputs = [\n", " ''.join(replacements.get(char, char) for char in output)\n", " for output in outputs\n", "]" ], "metadata": { "id": "c8C2sJjJA4na" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Results" ], "metadata": { "id": "JAIAobLFCKCr" } }, { "cell_type": "code", "source": [ "print_all_metrics(mapped_outputs)\n", "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n", "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)+\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CEs_TODaAFHO", "outputId": "e2f49ad9-e667-49b7-ebfc-c2480434a3cc" }, 
"execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t173.3057 (s)\n", "AVG TIME:\t\t0.4333 (s)+\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Runs\n", "\n", "## First:\n", "\n", "```\n", "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t193.6718 (s)\n", "AVG TIME:\t\t0.4842 (s)+\n", "```\n", "\n", "## Second\n", "\n", "```\n", "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t166.1370 (s)\n", "AVG TIME:\t\t0.4153 (s)+\n", "```\n", "\n", "## Third\n", "\n", "```\n", "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t180.0494 (s)\n", "AVG TIME:\t\t0.4501 (s)+\n", "```\n", "\n", "## Fourth\n", "\n", "```\n", "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t179.7101 (s)\n", "AVG TIME:\t\t0.4493 (s)+\n", "```\n", "\n", "## Fifth\n", "\n", "```\n", "PER: \t\t\t4.8063\n", "HOMOGRAPH: \t\t47.1698\n", "TOTAL TIME:\t\t173.3057 (s)\n", "AVG TIME:\t\t0.4333 (s)+\n", "```" ], "metadata": { "id": "DeOaBaWEJI6x" } } ] }