Benchmarking notebooks for various Persian G2P models, including Homo-GE2PE and Homo-T5, comparing their performance on the SentenceBench dataset.
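For orientation, the sketch below shows the overall flow these notebooks follow: load the SentenceBench sentences, run a G2P model over the graphemes, and score the predictions with a character error rate. This is an illustrative outline, not the notebook itself; it assumes the `grapheme`/`phoneme` column names and the PersianG2p calls that appear in the notebook.

```python
# Illustrative sketch only (assumes SentenceBench.csv with 'grapheme'/'phoneme'
# columns and the PersianG2p package, as used in the notebook below).
import pandas as pd
from jiwer import cer
from PersianG2p import Persian_g2p_converter

bench = pd.read_csv("SentenceBench.csv")      # grapheme/phoneme sentence pairs
g2p = Persian_g2p_converter()

errors = []
for grapheme, phoneme in zip(bench["grapheme"], bench["phoneme"]):
    predicted = g2p.transliterate(grapheme, tidy=False, secret=True)
    errors.append(cer(phoneme, predicted))    # per-sentence character error rate

print(f"Average CER (used as PER): {sum(errors) / len(errors):.4f}")
```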

Benchmark_PasaOpasen_PersianG2P.ipynb 52KB

  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "provenance": [],
  7. "collapsed_sections": [
  8. "EOZGZa2lMfPe",
  9. "VtxEYym69RUH",
  10. "AdU8VMTIOWLZ",
  11. "a3zuvbqx2l68",
  12. "R6PE5ds45TPr",
  13. "JGEUIrbi9kNH"
  14. ]
  15. },
  16. "kernelspec": {
  17. "name": "python3",
  18. "display_name": "Python 3"
  19. },
  20. "language_info": {
  21. "name": "python"
  22. }
  23. },
  24. "cells": [
  25. {
  26. "cell_type": "markdown",
  27. "metadata": {
  28. "id": "WEY5MiKLzurH"
  29. },
  30. "source": [
  31. "# Setup Environment"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "source": [
  37. "! pip install PersianG2p"
  38. ],
  39. "metadata": {
  40. "id": "JJ9VISNqZS6m",
  41. "colab": {
  42. "base_uri": "https://localhost:8080/",
  43. "height": 1000
  44. },
  45. "outputId": "49fc5536-24cc-44d2-8a9d-47c156afa433"
  46. },
  47. "execution_count": null,
  48. "outputs": [
  49. {
  50. "output_type": "stream",
  51. "name": "stdout",
  52. "text": [
  53. "Collecting PersianG2p\n",
  54. " Downloading PersianG2p-0.3.2-py3-none-any.whl.metadata (6.5 kB)\n",
  55. "Collecting hazm (from PersianG2p)\n",
  56. " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
  57. "Requirement already satisfied: inflect in /usr/local/lib/python3.11/dist-packages (from PersianG2p) (7.5.0)\n",
  58. "Collecting num2fawords (from PersianG2p)\n",
  59. " Downloading num2fawords-1.1-py3-none-any.whl.metadata (4.1 kB)\n",
  60. "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from PersianG2p) (2.0.2)\n",
  61. "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm->PersianG2p)\n",
  62. " Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
  63. "Collecting flashtext<3.0,>=2.7 (from hazm->PersianG2p)\n",
  64. " Downloading flashtext-2.7.tar.gz (14 kB)\n",
  65. " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  66. "Collecting gensim<5.0.0,>=4.3.1 (from hazm->PersianG2p)\n",
  67. " Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
  68. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm->PersianG2p) (3.9.1)\n",
  69. "Collecting numpy (from PersianG2p)\n",
  70. " Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
  71. "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm->PersianG2p)\n",
  72. " Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
  73. "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm->PersianG2p) (1.6.1)\n",
  74. "Requirement already satisfied: more_itertools>=8.5.0 in /usr/local/lib/python3.11/dist-packages (from inflect->PersianG2p) (10.7.0)\n",
  75. "Requirement already satisfied: typeguard>=4.0.1 in /usr/local/lib/python3.11/dist-packages (from inflect->PersianG2p) (4.4.2)\n",
  76. "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm->PersianG2p)\n",
  77. " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
  78. "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm->PersianG2p) (75.2.0)\n",
  79. "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm->PersianG2p)\n",
  80. " Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
  81. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  82. "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm->PersianG2p) (7.1.0)\n",
  83. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->PersianG2p) (8.1.8)\n",
  84. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->PersianG2p) (1.4.2)\n",
  85. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->PersianG2p) (2024.11.6)\n",
  86. "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->PersianG2p) (4.67.1)\n",
  87. "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm->PersianG2p) (3.6.0)\n",
  88. "Requirement already satisfied: typing_extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from typeguard>=4.0.1->inflect->PersianG2p) (4.13.2)\n",
  89. "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm->PersianG2p) (1.17.2)\n",
  90. "Downloading PersianG2p-0.3.2-py3-none-any.whl (928 kB)\n",
  91. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m928.1/928.1 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  92. "\u001b[?25hDownloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
  93. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  94. "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
  95. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m31.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  96. "\u001b[?25hDownloading num2fawords-1.1-py3-none-any.whl (9.8 kB)\n",
  97. "Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
  98. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m55.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  99. "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
  100. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  101. "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
  102. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  103. "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
  104. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  105. "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
  106. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  107. "\u001b[?25hBuilding wheels for collected packages: flashtext\n",
  108. " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  109. " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=bbdfef47ecf07814b056f2477b78913068fd94f3b79fbe865726138fe5a4ddc1\n",
  110. " Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
  111. "Successfully built flashtext\n",
  112. "Installing collected packages: num2fawords, flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm, PersianG2p\n",
  113. " Attempting uninstall: numpy\n",
  114. " Found existing installation: numpy 2.0.2\n",
  115. " Uninstalling numpy-2.0.2:\n",
  116. " Successfully uninstalled numpy-2.0.2\n",
  117. " Attempting uninstall: scipy\n",
  118. " Found existing installation: scipy 1.15.2\n",
  119. " Uninstalling scipy-1.15.2:\n",
  120. " Successfully uninstalled scipy-1.15.2\n",
  121. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  122. "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
  123. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
  124. "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
  125. "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
  126. "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  127. "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  128. "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
  129. "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
  130. "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
  131. "\u001b[0mSuccessfully installed PersianG2p-0.3.2 fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 num2fawords-1.1 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
  132. ]
  133. },
  134. {
  135. "output_type": "display_data",
  136. "data": {
  137. "application/vnd.colab-display-data+json": {
  138. "pip_warning": {
  139. "packages": [
  140. "numpy"
  141. ]
  142. },
  143. "id": "64c42c29e8e04317a2d190a1cd236293"
  144. }
  145. },
  146. "metadata": {}
  147. }
  148. ]
  149. },
  150. {
  151. "cell_type": "code",
  152. "source": [
  153. "! pip install jiwer"
  154. ],
  155. "metadata": {
  156. "colab": {
  157. "base_uri": "https://localhost:8080/"
  158. },
  159. "id": "u-Wc6za7ejMs",
  160. "outputId": "1a08f250-f5a8-4d78-a476-a0196241694c"
  161. },
  162. "execution_count": null,
  163. "outputs": [
  164. {
  165. "output_type": "stream",
  166. "name": "stdout",
  167. "text": [
  168. "Collecting jiwer\n",
  169. " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
  170. "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
  171. "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
  172. " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
  173. "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
  174. "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
  175. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  176. "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
  177. "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
  178. ]
  179. }
  180. ]
  181. },
  182. {
  183. "cell_type": "code",
  184. "source": [
  185. "import os\n",
  186. "import re\n",
  187. "from tqdm import tqdm\n",
  188. "import csv\n",
  189. "import pandas as pd\n",
  190. "import json\n",
  191. "import itertools\n",
  192. "from jiwer import cer"
  193. ],
  194. "metadata": {
  195. "id": "xDqy82HJeoui"
  196. },
  197. "execution_count": null,
  198. "outputs": []
  199. },
  200. {
  201. "cell_type": "markdown",
  202. "source": [
  203. "# Setup Model"
  204. ],
  205. "metadata": {
  206. "id": "Y3BIUX2depmo"
  207. }
  208. },
  209. {
  210. "cell_type": "code",
  211. "source": [
  212. "from PersianG2p import Persian_g2p_converter\n",
  213. "\n",
  214. "PersianG2Pconverter = Persian_g2p_converter()"
  215. ],
  216. "metadata": {
  217. "id": "dOhI9I0TZPX-"
  218. },
  219. "execution_count": null,
  220. "outputs": []
  221. },
  222. {
  223. "cell_type": "code",
  224. "source": [
  225. "PersianG2Pconverter.transliterate('دلم میخواست برم ', tidy = False, secret = True)"
  226. ],
  227. "metadata": {
  228. "colab": {
  229. "base_uri": "https://localhost:8080/",
  230. "height": 35
  231. },
  232. "id": "K2tlspKhZz7m",
  233. "outputId": "3e42869a-1b12-4c20-cab9-1cbda35711ca"
  234. },
  235. "execution_count": null,
  236. "outputs": [
  237. {
  238. "output_type": "execute_result",
  239. "data": {
  240. "text/plain": [
  241. "'dalam mixAst beram'"
  242. ],
  243. "application/vnd.google.colaboratory.intrinsic+json": {
  244. "type": "string"
  245. }
  246. },
  247. "metadata": {},
  248. "execution_count": 4
  249. }
  250. ]
  251. },
  252. {
  253. "cell_type": "code",
  254. "source": [
  255. "PersianG2Pconverter.transliterate('دلم میخواست برم ', tidy = True, secret = True)"
  256. ],
  257. "metadata": {
  258. "colab": {
  259. "base_uri": "https://localhost:8080/",
  260. "height": 35
  261. },
  262. "id": "qhZVmJ1hZ7V4",
  263. "outputId": "dcbfa84e-97e3-4ec9-b906-5f29be545c11"
  264. },
  265. "execution_count": null,
  266. "outputs": [
  267. {
  268. "output_type": "execute_result",
  269. "data": {
  270. "text/plain": [
  271. "'dalam mixāst beram'"
  272. ],
  273. "application/vnd.google.colaboratory.intrinsic+json": {
  274. "type": "string"
  275. }
  276. },
  277. "metadata": {},
  278. "execution_count": 5
  279. }
  280. ]
  281. },
  282. {
  283. "cell_type": "code",
  284. "source": [
  285. "PersianG2Pconverter.transliterate('انجمن نابینایان برای افرادی که تمایل به شنیدن مجله‌ی نسل مانا را دارند، این امکان را فراهم کرده‌است.', tidy = False, secret = True)"
  286. ],
  287. "metadata": {
  288. "colab": {
  289. "base_uri": "https://localhost:8080/",
  290. "height": 35
  291. },
  292. "id": "24-8fAuZaAxf",
  293. "outputId": "a1d4d066-e1ac-4e35-85fb-fbce079cd422"
  294. },
  295. "execution_count": null,
  296. "outputs": [
  297. {
  298. "output_type": "execute_result",
  299. "data": {
  300. "text/plain": [
  301. "'anjoman nAbinA?An barA^ye efrAdi ke tamAyol be Senidan majele?i nasl mAnA rA dArand ، in emkAn rA farAham kerdedest .'"
  302. ],
  303. "application/vnd.google.colaboratory.intrinsic+json": {
  304. "type": "string"
  305. }
  306. },
  307. "metadata": {},
  308. "execution_count": 6
  309. }
  310. ]
  311. },
  312. {
  313. "cell_type": "markdown",
  314. "source": [
  315. "# mapping"
  316. ],
  317. "metadata": {
  318. "id": "VtxEYym69RUH"
  319. }
  320. },
  321. {
  322. "cell_type": "code",
  323. "source": [
  324. "output_to_phonetics_map = {\n",
  325. " 'м': 'm',\n",
  326. " 'ʷ':' v',\n",
  327. " 'w': 'v',\n",
  328. " 'c': 'k',\n",
  329. " 'ĉ': 'C',\n",
  330. " 'č': 'C',\n",
  331. " '̕': \"?\",\n",
  332. " \"'\": '?',\n",
  333. " 'ʔ': \"?\",\n",
  334. " 'ꞌ': \"?\",\n",
  335. " '̛': \"?\",\n",
  336. " '’': \"?\",\n",
  337. " 'ʼ': \"?\",\n",
  338. " \"'\": '?',\n",
  339. " 'â': 'A',\n",
  340. " 'â': 'A',\n",
  341. " 'ȃ': 'A',\n",
  342. " 'ž': 'Z',\n",
  343. " 'š': 'S',\n",
  344. " 'W': 'v',\n",
  345. " 'β': 'f',\n",
  346. " 'е': 'e',\n",
  347. " '`': \"?\",\n",
  348. " 'ɑ': 'A',\n",
  349. " 'ɑ': 'A',\n",
  350. " 'ʃ': 'S',\n",
  351. " 'ð': 'z',\n",
  352. " 'ɾ': 'r',\n",
  353. " 'æ': 'a',\n",
  354. " 'ɪ': 'e',\n",
  355. " 'χ': 'x',\n",
  356. " 'ɣ': 'q',\n",
  357. " 'ʒ': 'Z',\n",
  358. " ':': '',\n",
  359. " 'ː': '',\n",
  360. " 'ā': 'A',\n",
  361. " 'ː': '',\n",
  362. " 'ä': 'A',\n",
  363. " 'á': 'A',\n",
  364. " 'š': 'S',\n",
  365. " 'ū': 'u',\n",
  366. " 'û': 'u',\n",
  367. " 'ś': 's',\n",
  368. " 'ī': 'i',\n",
  369. " 'í': 'i',\n",
  370. " 'î': 'i',\n",
  371. " 'é': 'e',\n",
  372. " 'ḥ': 'h',\n",
  373. " 'ɒ': 'A',\n",
  374. " 'ʰ': '',\n",
  375. " 'ə': 'e',\n",
  376. " 'R': 'r',\n",
  377. " 'W': 'v',\n",
  378. " 'Q': 'q',\n",
  379. " 'T': 't',\n",
  380. " 'Y': 'y',\n",
  381. " 'P': 'p',\n",
  382. " 'D': 'd',\n",
  383. " 'F': 'f',\n",
  384. " 'H': 'h',\n",
  385. " 'J': 'j',\n",
  386. " 'L': 'l',\n",
  387. " 'X': 'x',\n",
  388. " 'V': 'v',\n",
  389. " 'B': 'b',\n",
  390. " 'N': 'n',\n",
  391. " 'M': 'm',\n",
  392. " 'K': 'k',\n",
  393. " 'G': 'g',\n",
  394. " 'U': 'u',\n",
  395. " 'O': 'o',\n",
  396. " 'I': 'i',\n",
  397. " 'E': 'e',\n",
  398. " 'ŋ': 'ng',\n",
  399. " '.': '',\n",
  400. " 'ɛ': 'e',\n",
  401. " 'ʊ': 'u',\n",
  402. " \"ˈ\": '?',\n",
  403. " 'ù': 'u',\n",
  404. " 'θ': 's',\n",
  405. " '̪': '',\n",
  406. " 'ũ': 'u',\n",
  407. " '_': '',\n",
  408. " 'ç': 'C',\n",
  409. " 'ĝ': 'q',\n",
  410. " 'ɢ': 'q',\n",
  411. " 'ː': '',\n",
  412. " 'í': 'i',\n",
  413. " 'ŝ': 'S',\n",
  414. " '!': '',\n",
  415. " 'ǧ': 'q',\n",
  416. " 'ʻ': '?',\n",
  417. " 'è': 'e',\n",
  418. " '�': '',\n",
  419. " 'ú': 'u',\n",
  420. " 'ô': 'o',\n",
  421. " 'ē': 'e',\n",
  422. " 'à': 'A',\n",
  423. " 'ă': 'A',\n",
  424. " 'ǐ': 'i',\n",
  425. " 'ü': 'u',\n",
  426. " '\\u200e': '',\n",
  427. " 'ğ': 'q',\n",
  428. " 'ṣ': 'S',\n",
  429. " 'â': 'A',\n",
  430. " 'â': 'A',\n",
  431. " 'ȃ': 'A',\n",
  432. " 'ž': 'Z',\n",
  433. " 'š': 'S',\n",
  434. " 'ā': 'A',\n",
  435. " 'ː': '',\n",
  436. " 'ä': 'A',\n",
  437. " 'á': 'A',\n",
  438. " 'š': 'S',\n",
  439. " 'ū': 'u',\n",
  440. " 'û': 'u',\n",
  441. " 'ś': 'S',\n",
  442. " 'ī': 'i',\n",
  443. " 'í': 'i',\n",
  444. " 'î': 'i',\n",
  445. " 'é': 'e',\n",
  446. "}\n",
  447. "\n",
  448. "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n",
  449. "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n",
  450. "\n",
  451. "\n",
  452. "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n",
  453. " substituted = re.sub(r'tʃʰ', 'C', input_string)\n",
  454. " substituted = re.sub(r't͡ʃ', 'C', input_string)\n",
  455. " substituted = re.sub(r'tʃ', 'C', substituted)\n",
  456. " substituted = re.sub(r't͡S', 'C', substituted)\n",
  457. " substituted = re.sub(r'ow', 'o', substituted)\n",
  458. " substituted = re.sub('d͡ʒ', 'j', substituted)\n",
  459. " substituted = re.sub('dʒ', 'j', substituted)\n",
  460. "\n",
  461. " # Create a translation table using str.maketrans\n",
  462. " translation_table = str.maketrans(char_map)\n",
  463. "\n",
  464. " # Use str.translate to replace characters based on the translation table\n",
  465. " translated = substituted.translate(translation_table)\n",
  466. "\n",
  467. " return translated"
  468. ],
  469. "metadata": {
  470. "id": "TKx8oA1n7rKh"
  471. },
  472. "execution_count": null,
  473. "outputs": []
  474. },
  475. {
  476. "cell_type": "markdown",
  477. "metadata": {
  478. "id": "XjAPkfq7SF87"
  479. },
  480. "source": [
  481. "# Get Evaluation Data"
  482. ]
  483. },
  484. {
  485. "cell_type": "code",
  486. "source": [
  487. "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
  488. ],
  489. "metadata": {
  490. "id": "qwCG0jX-88nQ",
  491. "colab": {
  492. "base_uri": "https://localhost:8080/"
  493. },
  494. "outputId": "873c1f69-e2b4-4363-c7a5-f38b47513659"
  495. },
  496. "execution_count": null,
  497. "outputs": [
  498. {
  499. "output_type": "stream",
  500. "name": "stdout",
  501. "text": [
  502. "--2025-05-10 11:45:09-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
  503. "Resolving huggingface.co (huggingface.co)... 13.226.52.8, 13.226.52.35, 13.226.52.100, ...\n",
  504. "Connecting to huggingface.co (huggingface.co)|13.226.52.8|:443... connected.\n",
  505. "HTTP request sent, awaiting response... 200 OK\n",
  506. "Length: 56026 (55K) [text/plain]\n",
  507. "Saving to: ‘SentenceBench.csv’\n",
  508. "\n",
  509. "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.01s \n",
  510. "\n",
  511. "2025-05-10 11:45:09 (4.13 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
  512. "\n"
  513. ]
  514. }
  515. ]
  516. },
  517. {
  518. "cell_type": "code",
  519. "source": [
  520. "sentence_bench = pd.read_csv('SentenceBench.csv')"
  521. ],
  522. "metadata": {
  523. "id": "hJO-UAPDQvcb"
  524. },
  525. "execution_count": null,
  526. "outputs": []
  527. },
  528. {
  529. "cell_type": "code",
  530. "source": [
  531. "sentence_bench.head(3)"
  532. ],
  533. "metadata": {
  534. "colab": {
  535. "base_uri": "https://localhost:8080/",
  536. "height": 143
  537. },
  538. "id": "qlYbrnUa9LAN",
  539. "outputId": "e27fedf6-570c-4a31-8d6b-84c64b23dd90"
  540. },
  541. "execution_count": null,
  542. "outputs": [
  543. {
  544. "output_type": "execute_result",
  545. "data": {
  546. "text/plain": [
  547. " dataset grapheme \\\n",
  548. "0 homograph من قدر تو را می‌دانم \n",
  549. "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
  550. "2 homograph به دست و صورتم کرم زدم \n",
  551. "\n",
  552. " phoneme homograph word \\\n",
  553. "0 man qadr-e to rA mi-dAnam قدر \n",
  554. "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
  555. "2 be dast-o suratam kerem zadam کرم \n",
  556. "\n",
  557. " pronunciation \n",
  558. "0 qadr \n",
  559. "1 qadar \n",
  560. "2 kerem "
  561. ],
  562. "text/html": [
  563. "\n",
  564. " <div id=\"df-2f9a2ff8-5fad-4b3d-a3e6-12372061610b\" class=\"colab-df-container\">\n",
  565. " <div>\n",
  566. "<style scoped>\n",
  567. " .dataframe tbody tr th:only-of-type {\n",
  568. " vertical-align: middle;\n",
  569. " }\n",
  570. "\n",
  571. " .dataframe tbody tr th {\n",
  572. " vertical-align: top;\n",
  573. " }\n",
  574. "\n",
  575. " .dataframe thead th {\n",
  576. " text-align: right;\n",
  577. " }\n",
  578. "</style>\n",
  579. "<table border=\"1\" class=\"dataframe\">\n",
  580. " <thead>\n",
  581. " <tr style=\"text-align: right;\">\n",
  582. " <th></th>\n",
  583. " <th>dataset</th>\n",
  584. " <th>grapheme</th>\n",
  585. " <th>phoneme</th>\n",
  586. " <th>homograph word</th>\n",
  587. " <th>pronunciation</th>\n",
  588. " </tr>\n",
  589. " </thead>\n",
  590. " <tbody>\n",
  591. " <tr>\n",
  592. " <th>0</th>\n",
  593. " <td>homograph</td>\n",
  594. " <td>من قدر تو را می‌دانم</td>\n",
  595. " <td>man qadr-e to rA mi-dAnam</td>\n",
  596. " <td>قدر</td>\n",
  597. " <td>qadr</td>\n",
  598. " </tr>\n",
  599. " <tr>\n",
  600. " <th>1</th>\n",
  601. " <td>homograph</td>\n",
  602. " <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
  603. " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
  604. " <td>قدر</td>\n",
  605. " <td>qadar</td>\n",
  606. " </tr>\n",
  607. " <tr>\n",
  608. " <th>2</th>\n",
  609. " <td>homograph</td>\n",
  610. " <td>به دست و صورتم کرم زدم</td>\n",
  611. " <td>be dast-o suratam kerem zadam</td>\n",
  612. " <td>کرم</td>\n",
  613. " <td>kerem</td>\n",
  614. " </tr>\n",
  615. " </tbody>\n",
  616. "</table>\n",
  617. "</div>\n",
  618. " <div class=\"colab-df-buttons\">\n",
  619. "\n",
  620. " <div class=\"colab-df-container\">\n",
  621. " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2f9a2ff8-5fad-4b3d-a3e6-12372061610b')\"\n",
  622. " title=\"Convert this dataframe to an interactive table.\"\n",
  623. " style=\"display:none;\">\n",
  624. "\n",
  625. " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
  626. " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
  627. " </svg>\n",
  628. " </button>\n",
  629. "\n",
  630. " <style>\n",
  631. " .colab-df-container {\n",
  632. " display:flex;\n",
  633. " gap: 12px;\n",
  634. " }\n",
  635. "\n",
  636. " .colab-df-convert {\n",
  637. " background-color: #E8F0FE;\n",
  638. " border: none;\n",
  639. " border-radius: 50%;\n",
  640. " cursor: pointer;\n",
  641. " display: none;\n",
  642. " fill: #1967D2;\n",
  643. " height: 32px;\n",
  644. " padding: 0 0 0 0;\n",
  645. " width: 32px;\n",
  646. " }\n",
  647. "\n",
  648. " .colab-df-convert:hover {\n",
  649. " background-color: #E2EBFA;\n",
  650. " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  651. " fill: #174EA6;\n",
  652. " }\n",
  653. "\n",
  654. " .colab-df-buttons div {\n",
  655. " margin-bottom: 4px;\n",
  656. " }\n",
  657. "\n",
  658. " [theme=dark] .colab-df-convert {\n",
  659. " background-color: #3B4455;\n",
  660. " fill: #D2E3FC;\n",
  661. " }\n",
  662. "\n",
  663. " [theme=dark] .colab-df-convert:hover {\n",
  664. " background-color: #434B5C;\n",
  665. " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
  666. " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
  667. " fill: #FFFFFF;\n",
  668. " }\n",
  669. " </style>\n",
  670. "\n",
  671. " <script>\n",
  672. " const buttonEl =\n",
  673. " document.querySelector('#df-2f9a2ff8-5fad-4b3d-a3e6-12372061610b button.colab-df-convert');\n",
  674. " buttonEl.style.display =\n",
  675. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  676. "\n",
  677. " async function convertToInteractive(key) {\n",
  678. " const element = document.querySelector('#df-2f9a2ff8-5fad-4b3d-a3e6-12372061610b');\n",
  679. " const dataTable =\n",
  680. " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
  681. " [key], {});\n",
  682. " if (!dataTable) return;\n",
  683. "\n",
  684. " const docLinkHtml = 'Like what you see? Visit the ' +\n",
  685. " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
  686. " + ' to learn more about interactive tables.';\n",
  687. " element.innerHTML = '';\n",
  688. " dataTable['output_type'] = 'display_data';\n",
  689. " await google.colab.output.renderOutput(dataTable, element);\n",
  690. " const docLink = document.createElement('div');\n",
  691. " docLink.innerHTML = docLinkHtml;\n",
  692. " element.appendChild(docLink);\n",
  693. " }\n",
  694. " </script>\n",
  695. " </div>\n",
  696. "\n",
  697. "\n",
  698. " <div id=\"df-de5a023e-64df-4224-8df0-57191f788f74\">\n",
  699. " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-de5a023e-64df-4224-8df0-57191f788f74')\"\n",
  700. " title=\"Suggest charts\"\n",
  701. " style=\"display:none;\">\n",
  702. "\n",
  703. "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
  704. " width=\"24px\">\n",
  705. " <g>\n",
  706. " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
  707. " </g>\n",
  708. "</svg>\n",
  709. " </button>\n",
  710. "\n",
  711. "<style>\n",
  712. " .colab-df-quickchart {\n",
  713. " --bg-color: #E8F0FE;\n",
  714. " --fill-color: #1967D2;\n",
  715. " --hover-bg-color: #E2EBFA;\n",
  716. " --hover-fill-color: #174EA6;\n",
  717. " --disabled-fill-color: #AAA;\n",
  718. " --disabled-bg-color: #DDD;\n",
  719. " }\n",
  720. "\n",
  721. " [theme=dark] .colab-df-quickchart {\n",
  722. " --bg-color: #3B4455;\n",
  723. " --fill-color: #D2E3FC;\n",
  724. " --hover-bg-color: #434B5C;\n",
  725. " --hover-fill-color: #FFFFFF;\n",
  726. " --disabled-bg-color: #3B4455;\n",
  727. " --disabled-fill-color: #666;\n",
  728. " }\n",
  729. "\n",
  730. " .colab-df-quickchart {\n",
  731. " background-color: var(--bg-color);\n",
  732. " border: none;\n",
  733. " border-radius: 50%;\n",
  734. " cursor: pointer;\n",
  735. " display: none;\n",
  736. " fill: var(--fill-color);\n",
  737. " height: 32px;\n",
  738. " padding: 0;\n",
  739. " width: 32px;\n",
  740. " }\n",
  741. "\n",
  742. " .colab-df-quickchart:hover {\n",
  743. " background-color: var(--hover-bg-color);\n",
  744. " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  745. " fill: var(--button-hover-fill-color);\n",
  746. " }\n",
  747. "\n",
  748. " .colab-df-quickchart-complete:disabled,\n",
  749. " .colab-df-quickchart-complete:disabled:hover {\n",
  750. " background-color: var(--disabled-bg-color);\n",
  751. " fill: var(--disabled-fill-color);\n",
  752. " box-shadow: none;\n",
  753. " }\n",
  754. "\n",
  755. " .colab-df-spinner {\n",
  756. " border: 2px solid var(--fill-color);\n",
  757. " border-color: transparent;\n",
  758. " border-bottom-color: var(--fill-color);\n",
  759. " animation:\n",
  760. " spin 1s steps(1) infinite;\n",
  761. " }\n",
  762. "\n",
  763. " @keyframes spin {\n",
  764. " 0% {\n",
  765. " border-color: transparent;\n",
  766. " border-bottom-color: var(--fill-color);\n",
  767. " border-left-color: var(--fill-color);\n",
  768. " }\n",
  769. " 20% {\n",
  770. " border-color: transparent;\n",
  771. " border-left-color: var(--fill-color);\n",
  772. " border-top-color: var(--fill-color);\n",
  773. " }\n",
  774. " 30% {\n",
  775. " border-color: transparent;\n",
  776. " border-left-color: var(--fill-color);\n",
  777. " border-top-color: var(--fill-color);\n",
  778. " border-right-color: var(--fill-color);\n",
  779. " }\n",
  780. " 40% {\n",
  781. " border-color: transparent;\n",
  782. " border-right-color: var(--fill-color);\n",
  783. " border-top-color: var(--fill-color);\n",
  784. " }\n",
  785. " 60% {\n",
  786. " border-color: transparent;\n",
  787. " border-right-color: var(--fill-color);\n",
  788. " }\n",
  789. " 80% {\n",
  790. " border-color: transparent;\n",
  791. " border-right-color: var(--fill-color);\n",
  792. " border-bottom-color: var(--fill-color);\n",
  793. " }\n",
  794. " 90% {\n",
  795. " border-color: transparent;\n",
  796. " border-bottom-color: var(--fill-color);\n",
  797. " }\n",
  798. " }\n",
  799. "</style>\n",
  800. "\n",
  801. " <script>\n",
  802. " async function quickchart(key) {\n",
  803. " const quickchartButtonEl =\n",
  804. " document.querySelector('#' + key + ' button');\n",
  805. " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
  806. " quickchartButtonEl.classList.add('colab-df-spinner');\n",
  807. " try {\n",
  808. " const charts = await google.colab.kernel.invokeFunction(\n",
  809. " 'suggestCharts', [key], {});\n",
  810. " } catch (error) {\n",
  811. " console.error('Error during call to suggestCharts:', error);\n",
  812. " }\n",
  813. " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
  814. " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
  815. " }\n",
  816. " (() => {\n",
  817. " let quickchartButtonEl =\n",
  818. " document.querySelector('#df-de5a023e-64df-4224-8df0-57191f788f74 button');\n",
  819. " quickchartButtonEl.style.display =\n",
  820. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  821. " })();\n",
  822. " </script>\n",
  823. " </div>\n",
  824. " </div>\n",
  825. " </div>\n"
  826. ],
  827. "application/vnd.google.colaboratory.intrinsic+json": {
  828. "type": "dataframe",
  829. "variable_name": "sentence_bench",
  830. "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. \\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
  831. }
  832. },
  833. "metadata": {},
  834. "execution_count": 10
  835. }
  836. ]
  837. },
  838. {
  839. "cell_type": "markdown",
  840. "metadata": {
  841. "id": "wDV7ysXf2b_H"
  842. },
  843. "source": [
  844. "### Get ManaTTS"
  845. ]
  846. },
  847. {
  848. "cell_type": "code",
  849. "execution_count": null,
  850. "metadata": {
  851. "colab": {
  852. "base_uri": "https://localhost:8080/"
  853. },
  854. "id": "TcL5ZLvSSnVB",
  855. "outputId": "40a6c0bc-e916-4515-ae86-158135f8242d"
  856. },
  857. "outputs": [
  858. {
  859. "output_type": "execute_result",
  860. "data": {
  861. "text/plain": [
  862. "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
  863. " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
  864. " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
  865. " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
  866. " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
  867. " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
  868. ]
  869. },
  870. "metadata": {},
  871. "execution_count": 11
  872. }
  873. ],
  874. "source": [
  875. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
  876. "\n",
  877. "# Convert to a list of tuples\n",
  878. "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  879. "\n",
  880. "mana_evaluation_data[:3]"
  881. ]
  882. },
  883. {
  884. "cell_type": "markdown",
  885. "metadata": {
  886. "id": "Jjacw9Mp2eoX"
  887. },
  888. "source": [
  889. "### Get CommonVoice"
  890. ]
  891. },
  892. {
  893. "cell_type": "code",
  894. "execution_count": null,
  895. "metadata": {
  896. "id": "-yQnqCGw26sk",
  897. "colab": {
  898. "base_uri": "https://localhost:8080/"
  899. },
  900. "outputId": "d45bb201-06b7-48c5-d153-d6908de6bb2f"
  901. },
  902. "outputs": [
  903. {
  904. "output_type": "execute_result",
  905. "data": {
  906. "text/plain": [
  907. "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
  908. " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
  909. " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
  910. " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
  911. " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
  912. ]
  913. },
  914. "metadata": {},
  915. "execution_count": 12
  916. }
  917. ],
  918. "source": [
  919. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
  920. "\n",
  921. "# Convert to a list of tuples\n",
  922. "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  923. "\n",
  924. "commonvoice_evaluation_data[:3]"
  925. ]
  926. },
  927. {
  928. "cell_type": "markdown",
  929. "metadata": {
  930. "id": "ciSPyhRc3Rvo"
  931. },
  932. "source": [
  933. "### Get Homograph"
  934. ]
  935. },
  936. {
  937. "cell_type": "code",
  938. "execution_count": null,
  939. "metadata": {
  940. "id": "XlFc5JbN3Rvz",
  941. "colab": {
  942. "base_uri": "https://localhost:8080/"
  943. },
  944. "outputId": "4da90100-f3ef-47b4-ad30-98307394d8d9"
  945. },
  946. "outputs": [
  947. {
  948. "output_type": "execute_result",
  949. "data": {
  950. "text/plain": [
  951. "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
  952. " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
  953. " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
  954. " 'قدر',\n",
  955. " 'qadar'),\n",
  956. " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
  957. ]
  958. },
  959. "metadata": {},
  960. "execution_count": 13
  961. }
  962. ],
  963. "source": [
  964. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
  965. "\n",
  966. "# Convert to a list of tuples\n",
  967. "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  968. "\n",
  969. "homograph_evaluation_data[:3]"
  970. ]
  971. },
  972. {
  973. "cell_type": "markdown",
  974. "metadata": {
  975. "id": "R6PE5ds45TPr"
  976. },
  977. "source": [
  978. "# Evaluate Method Outputs"
  979. ]
  980. },
  981. {
  982. "cell_type": "markdown",
  983. "metadata": {
  984. "id": "CLKaERek4u_D"
  985. },
  986. "source": [
  987. "## PER Evaluation"
  988. ]
  989. },
  990. {
  991. "cell_type": "code",
  992. "execution_count": null,
  993. "metadata": {
  994. "id": "nBee9xG54u_E"
  995. },
  996. "outputs": [],
  997. "source": [
  998. "def remove_non_word_chars(text):\n",
  999. " pattern = r'[^\\w\\s\\?]'\n",
  1000. " cleaned_text = re.sub(pattern, ' ', text)\n",
  1001. " return cleaned_text"
  1002. ]
  1003. },
  1004. {
  1005. "cell_type": "code",
  1006. "execution_count": null,
  1007. "metadata": {
  1008. "id": "W8PoNV9V4u_E"
  1009. },
  1010. "outputs": [],
  1011. "source": [
  1012. "def remove_white_spaces(text):\n",
  1013. " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
  1014. " return cleaned_text.strip()"
  1015. ]
  1016. },
  1017. {
  1018. "cell_type": "code",
  1019. "execution_count": null,
  1020. "metadata": {
  1021. "id": "YD0cvnn74u_E"
  1022. },
  1023. "outputs": [],
  1024. "source": [
  1025. "def get_word_only_text(text):\n",
  1026. " word_only_text = remove_non_word_chars(text)\n",
  1027. " extra_space_removed_text = remove_white_spaces(word_only_text)\n",
  1028. "\n",
  1029. " return extra_space_removed_text"
  1030. ]
  1031. },
  1032. {
  1033. "cell_type": "code",
  1034. "execution_count": null,
  1035. "metadata": {
  1036. "id": "6OQQDual4u_E"
  1037. },
  1038. "outputs": [],
  1039. "source": [
  1040. "def get_texts_cer(reference, model_output):\n",
  1041. " # Preprocess input texts to only contain word characters\n",
  1042. " word_only_reference = get_word_only_text(reference)\n",
  1043. " word_only_output = get_word_only_text(model_output)\n",
  1044. "\n",
  1045. " # Return +infinity for CER if any of the texts is empty\n",
  1046. " if not word_only_reference.strip() or not word_only_output.strip():\n",
  1047. " return float('inf')\n",
  1048. "\n",
  1049. " return cer(word_only_reference, word_only_output)"
  1050. ]
  1051. },
  1052. {
  1053. "cell_type": "code",
  1054. "execution_count": null,
  1055. "metadata": {
  1056. "id": "ncWQnPdW4u_E"
  1057. },
  1058. "outputs": [],
  1059. "source": [
  1060. "def get_avg_cer_of_method(method_outputs, references):\n",
  1061. " cers = []\n",
  1062. " for idx, o in enumerate(method_outputs):\n",
  1063. " cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
  1064. " if cer != float('inf'):\n",
  1065. " cers.append(cer)\n",
  1066. "\n",
  1067. " return sum(cers) / len(cers)"
  1068. ]
  1069. },
  1070. {
  1071. "cell_type": "markdown",
  1072. "source": [
  1073. "## Homograph Evaluation"
  1074. ],
  1075. "metadata": {
  1076. "id": "oBgNtpFQDwku"
  1077. }
  1078. },
  1079. {
  1080. "cell_type": "code",
  1081. "source": [
  1082. "def get_homograph_performance(outputs, references):\n",
  1083. " corrects = 0\n",
  1084. " total = 0\n",
  1085. "\n",
  1086. " for idx, (g, p, homograph, right) in enumerate(references):\n",
  1087. " if homograph != '':\n",
  1088. " total += 1\n",
  1089. " if right in outputs[idx]:\n",
  1090. " corrects += 1\n",
  1091. "\n",
  1092. " return corrects / total"
  1093. ],
  1094. "metadata": {
  1095. "id": "J445ULEvEEDn"
  1096. },
  1097. "execution_count": null,
  1098. "outputs": []
  1099. },
  1100. {
  1101. "cell_type": "markdown",
  1102. "source": [
  1103. "# Full bench"
  1104. ],
  1105. "metadata": {
  1106. "id": "JGEUIrbi9kNH"
  1107. }
  1108. },
  1109. {
  1110. "cell_type": "code",
  1111. "source": [
  1112. "benchmark = []\n",
  1113. "\n",
  1114. "for g, p in mana_evaluation_data:\n",
  1115. " benchmark.append((g, p, '', ''))\n",
  1116. "\n",
  1117. "for g, p in commonvoice_evaluation_data:\n",
  1118. " benchmark.append((g, p, '', ''))\n",
  1119. "\n",
  1120. "for g, p, w, r in homograph_evaluation_data:\n",
  1121. " benchmark.append((g, p, w, r))\n",
  1122. "\n",
  1123. "benchmark = benchmark[:400]"
  1124. ],
  1125. "metadata": {
  1126. "id": "fGzQvL8V9mln"
  1127. },
  1128. "execution_count": null,
  1129. "outputs": []
  1130. },
  1131. {
  1132. "cell_type": "code",
  1133. "source": [
  1134. "def print_all_metrics(predictions):\n",
  1135. " per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
  1136. " homograph = get_homograph_performance(predictions, benchmark) * 100\n",
  1137. "\n",
  1138. " print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
  1139. " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
  1140. ],
  1141. "metadata": {
  1142. "id": "DpSqE5oPbmAy"
  1143. },
  1144. "execution_count": null,
  1145. "outputs": []
  1146. },
  1147. {
  1148. "cell_type": "markdown",
  1149. "source": [
  1150. "# outputs"
  1151. ],
  1152. "metadata": {
  1153. "id": "Y8oTYWSXJvup"
  1154. }
  1155. },
  1156. {
  1157. "cell_type": "code",
  1158. "source": [
  1159. "from tqdm import tqdm\n",
  1160. "import time\n",
  1161. "\n",
  1162. "outputs = []\n",
  1163. "start_time = time.time()\n",
  1164. "\n",
  1165. "for g, p, _, _ in tqdm(benchmark):\n",
  1166. " o = PersianG2Pconverter.transliterate(g, tidy=False, secret=True)\n",
  1167. " outputs.append(o)\n",
  1168. "\n",
  1169. "total_time = time.time() - start_time\n",
  1170. "avg_time = total_time / len(benchmark) if len(benchmark) > 0 else 0"
  1171. ],
  1172. "metadata": {
  1173. "colab": {
  1174. "base_uri": "https://localhost:8080/"
  1175. },
  1176. "id": "ECW_8Ja5g7FY",
  1177. "outputId": "cb97adcb-d0ff-4151-e8d7-a452a5825d7d"
  1178. },
  1179. "execution_count": null,
  1180. "outputs": [
  1181. {
  1182. "output_type": "stream",
  1183. "name": "stderr",
  1184. "text": [
  1185. "100%|██████████| 400/400 [15:20<00:00, 2.30s/it]\n"
  1186. ]
  1187. }
  1188. ]
  1189. },
  1190. {
  1191. "cell_type": "code",
  1192. "source": [
  1193. "mapped_outputs = []\n",
  1194. "for o in outputs:\n",
  1195. " mapped = replace_phonetic_characters(o)\n",
  1196. " mapped_outputs.append(mapped)"
  1197. ],
  1198. "metadata": {
  1199. "id": "90gAxDT-GMhI"
  1200. },
  1201. "execution_count": null,
  1202. "outputs": []
  1203. },
  1204. {
  1205. "cell_type": "code",
  1206. "source": [
  1207. "print_all_metrics(mapped_outputs)\n",
  1208. "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
  1209. "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)\")"
  1210. ],
  1211. "metadata": {
  1212. "id": "zP4Tcj285Ij0",
  1213. "colab": {
  1214. "base_uri": "https://localhost:8080/"
  1215. },
  1216. "outputId": "619843e5-bb25-4b1b-f062-09a9d1c11f6f"
  1217. },
  1218. "execution_count": null,
  1219. "outputs": [
  1220. {
  1221. "output_type": "stream",
  1222. "name": "stdout",
  1223. "text": [
  1224. "PER: \t\t\t15.0414\n",
  1225. "HOMOGRAPH: \t\t37.7358\n",
  1226. "TOTAL TIME:\t\t920.0853 (s)\n",
  1227. "AVG TIME:\t\t2.3002 (s)\n"
  1228. ]
  1229. }
  1230. ]
  1231. },
  1232. {
  1233. "cell_type": "markdown",
  1234. "source": [
  1235. "# Runs\n",
  1236. "\n",
  1237. "## First:\n",
  1238. "\n",
  1239. "```\n",
  1240. "PER: \t\t\t15.0414\n",
  1241. "homograph: \t\t37.7358\n",
  1242. "TOTAL TIME:\t\t874.3154 (s)\n",
  1243. "AVG TIME:\t\t2.1858 (s)\n",
  1244. "```\n",
  1245. "\n",
  1246. "## Second\n",
  1247. "\n",
  1248. "```\n",
  1249. "PER: \t\t\t15.0414\n",
  1250. "homograph: \t\t37.7358\n",
  1251. "TOTAL TIME:\t\t814.6596 (s)\n",
  1252. "AVG TIME:\t\t2.0366 (s)\n",
  1253. "```\n",
  1254. "\n",
  1255. "## Third\n",
  1256. "\n",
  1257. "```\n",
  1258. "PER: \t\t\t15.0414\n",
  1259. "POLYPHONE: \t\t37.7358\n",
  1260. "TOTAL TIME:\t\t845.8805 (s)\n",
  1261. "AVG TIME:\t\t2.1147 (s)\n",
  1262. "```\n",
  1263. "\n",
  1264. "## Fourth\n",
  1265. "\n",
  1266. "```\n",
  1267. "PER: \t\t\t15.0414\n",
  1268. "HOMOGRAPH: \t\t37.7358\n",
  1269. "TOTAL TIME:\t\t882.1829 (s)\n",
  1270. "AVG TIME:\t\t2.2055 (s)\n",
  1271. "```\n",
  1272. "\n",
  1273. "## Fifth\n",
  1274. "\n",
  1275. "```\n",
  1276. "PER: \t\t\t15.0414\n",
  1277. "HOMOGRAPH: \t\t37.7358\n",
  1278. "TOTAL TIME:\t\t920.0853 (s)\n",
  1279. "AVG TIME:\t\t2.3002 (s)\n",
  1280. "```"
  1281. ],
  1282. "metadata": {
  1283. "id": "3vq1em5ElE_Q"
  1284. }
  1285. }
  1286. ]
  1287. }