Benchmarking notebooks for various Persian G2P models, including Homo-GE2PE and Homo-T5, comparing their performance on the SentenceBench dataset.

Benchmark_AzamRabiee_Persian_G2P.ipynb 60KB

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [
"wDV7ysXf2b_H",
"Jjacw9Mp2eoX",
"ciSPyhRc3Rvo"
]
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WEY5MiKLzurH"
},
"source": [
"# Setup Environment"
]
},
{
"cell_type": "code",
"source": [
"! pip install hazm==0.10.0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "euO_NTvwG0HW",
"outputId": "18d7ea5b-baeb-4d73-afa2-254ac5642fac"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting hazm==0.10.0\n",
" Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
"Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n",
" Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
"Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n",
" Downloading flashtext-2.7.tar.gz (14 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n",
" Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
"Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
"Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
" Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
"Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n",
" Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
"Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
"Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n",
" Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
"Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
"Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n",
" Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
"Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
"Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m58.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: flashtext\n",
" Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=e7380f6f98ff10f751d96f3b3233a8814bed40dc9fbcb43bace244e15a39a818\n",
" Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
"Successfully built flashtext\n",
"Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n",
" Attempting uninstall: numpy\n",
" Found existing installation: numpy 2.0.2\n",
" Uninstalling numpy-2.0.2:\n",
" Successfully uninstalled numpy-2.0.2\n",
" Attempting uninstall: scipy\n",
" Found existing installation: scipy 1.15.2\n",
" Uninstalling scipy-1.15.2:\n",
" Successfully uninstalled scipy-1.15.2\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
"thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
"treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
"pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
"albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
"albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
"tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
"jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
"jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
]
},
{
"output_type": "display_data",
"data": {
"application/vnd.colab-display-data+json": {
"pip_warning": {
"packages": [
"numpy"
]
},
"id": "f860e129e3a34cef9daac243c26d8728"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"!pip install numpy==1.26.0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y2cfyWETIpEf",
"outputId": "5814b320-2ead-4b47-94e6-3fad4d6bd5ee"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting numpy==1.26.0\n",
" Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: numpy\n",
" Attempting uninstall: numpy\n",
" Found existing installation: numpy 1.24.3\n",
" Uninstalling numpy-1.24.3:\n",
" Successfully uninstalled numpy-1.24.3\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n",
"thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed numpy-1.26.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install pandas==2.1.4"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Qe7BBEZTS7Y6",
"outputId": "acae1624-bc7e-4208-e2f0-80b9e66a18ff"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting pandas==2.1.4\n",
" Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
"Requirement already satisfied: numpy<2,>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (1.26.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.11/dist-packages (from pandas==2.1.4) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas==2.1.4) (1.17.0)\n",
"Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m97.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pandas\n",
" Attempting uninstall: pandas\n",
" Found existing installation: pandas 2.2.2\n",
" Uninstalling pandas-2.2.2:\n",
" Successfully uninstalled pandas-2.2.2\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.1.4 which is incompatible.\n",
"plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\n",
"mizani 0.13.3 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed pandas-2.1.4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! git clone https://github.com/AzamRabiee/Persian_G2P.git"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qPAmCjfcUJ_f",
"outputId": "28cb142c-3df0-4f4e-f008-7f29773b6aa6"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'Persian_G2P'...\n",
"remote: Enumerating objects: 35, done.\u001b[K\n",
"remote: Counting objects: 100% (6/6), done.\u001b[K\n",
"remote: Compressing objects: 100% (6/6), done.\u001b[K\n",
"remote: Total 35 (delta 1), reused 0 (delta 0), pack-reused 29 (from 1)\u001b[K\n",
"Receiving objects: 100% (35/35), 614.07 KiB | 3.96 MiB/s, done.\n",
"Resolving deltas: 100% (9/9), done.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! pip install num2fawords"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "53Wr50lQVFKe",
"outputId": "ab081b4f-3ea1-4448-e68a-c7761cdd554c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting num2fawords\n",
" Downloading num2fawords-1.1-py3-none-any.whl.metadata (4.1 kB)\n",
"Downloading num2fawords-1.1-py3-none-any.whl (9.8 kB)\n",
"Installing collected packages: num2fawords\n",
"Successfully installed num2fawords-1.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! pip install Distance"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YOYSz85eVPhk",
"outputId": "c69b78f5-f273-49e9-a9a6-ecb37ad63b82"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting Distance\n",
" Downloading Distance-0.1.3.tar.gz (180 kB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/180.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━\u001b[0m \u001b[32m174.1/180.3 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.3/180.3 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Building wheels for collected packages: Distance\n",
" Building wheel for Distance (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for Distance: filename=Distance-0.1.3-py3-none-any.whl size=16256 sha256=4eae7bc18b3a6f86cfdd2471b5159e0257a7276ed17d95f03216b27931d5838e\n",
" Stored in directory: /root/.cache/pip/wheels/fb/cd/9c/3ab5d666e3bcacc58900b10959edd3816cc9557c7337986322\n",
"Successfully built Distance\n",
"Installing collected packages: Distance\n",
"Successfully installed Distance-0.1.3\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! pip install jiwer"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vMY8rtJX7mwy",
"outputId": "4d8413fd-330f-4517-a52d-dedcca5c6524"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting jiwer\n",
" Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
"Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
"Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
" Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
"Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
"Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import re\n",
"from tqdm import tqdm\n",
"import csv\n",
"import pandas as pd\n",
"import json\n",
"import itertools\n",
"from jiwer import cer"
],
"metadata": {
"id": "LtiXrEaI7svO"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Setup Model"
],
"metadata": {
"id": "bfqjC8pN7viW"
}
},
{
"cell_type": "code",
"source": [
"! wget https://raw.githubusercontent.com/tihu-nlp/tihudict/master/tihu.demo.dict"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BlFG8_MSyfHv",
"outputId": "5f6ea297-ee76-4e0c-f855-6fa8fe0bcd3f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2025-05-11 10:20:04-- https://raw.githubusercontent.com/tihu-nlp/tihudict/master/tihu.demo.dict\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 49306 (48K) [text/plain]\n",
"Saving to: ‘tihu.demo.dict’\n",
"\n",
"tihu.demo.dict 100%[===================>] 48.15K --.-KB/s in 0.01s \n",
"\n",
"2025-05-11 10:20:04 (3.69 MB/s) - ‘tihu.demo.dict’ saved [49306/49306]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! mv tihu.demo.dict Persian_G2P/tihudict.dict"
],
"metadata": {
"id": "yGhXkFQQzUUB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os\n",
"os.chdir('Persian_G2P')"
],
"metadata": {
"id": "p31NbG4H0jPH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import subprocess\n",
"\n",
"def run_script_with_args(sent):\n",
"    # Run g2p.py in a fresh subprocess for each sentence and capture its stdout.\n",
"    try:\n",
"        command = [\"python\", \"g2p.py\", \"--text\", sent]\n",
"        result = subprocess.run(command, capture_output=True, text=True)\n",
"\n",
"        if result.returncode == 0:\n",
"            return result.stdout\n",
"        else:\n",
"            print(f\"An error occurred: {result.stderr}\")\n",
"            return ''\n",
"\n",
"    except Exception as e:\n",
"        print(f\"An unexpected error occurred: {str(e)}\")\n",
"        return ''"
],
"metadata": {
"id": "9cuSN2rfYhtb"
},
"execution_count": null,
"outputs": []
},
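{
"cell_type": "markdown",
"source": [
"A quick sanity check of the wrapper (a sketch; it uses the same sentence as the shell call below, so the two should agree):"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sanity check: the Python wrapper should return the same transcription\n",
"# as invoking g2p.py directly from the shell (next cell).\n",
"print(run_script_with_args('دلم میخواست برم '))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},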
{
"cell_type": "code",
"source": [
"! python g2p.py --text 'دلم میخواست برم '"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_BMSg8CcUrK1",
"outputId": "dc26661c-fe1c-474b-ebfb-08d1d8345b3d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"dalam mixAst beram\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! python g2p.py --text 'انجمن نابینایان برای افرادی که تمایل به شنیدن مجله‌ی نسل مانا را دارند، این امکان را فراهم کرده‌است.'"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UBzBWDKBXzi2",
"outputId": "32ba0590-e392-4d91-ae68-f0547838d288"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"a n j o m a n nAbinA?An b a r A ^ y e e f r A d i k e t a m A y o l b e Senidan majele?i n a s l mAnA r A d A r a n d ، i n e m k A n r A f a r A h a m kerdedest .\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Mapping"
],
"metadata": {
"id": "VtxEYym69RUH"
}
},
{
"cell_type": "code",
"source": [
"# Map model output symbols (IPA, romanization variants, Cyrillic look-alikes)\n",
"# to the benchmark phoneme set. Some entries look repeated; they may be\n",
"# distinct Unicode codepoints (e.g., composed vs. decomposed accents), so all\n",
"# are kept as-is.\n",
"output_to_phonetics_map = {\n",
"    'м': 'm',\n",
"    'ʷ':' v',\n",
"    'w': 'v',\n",
"    'c': 'k',\n",
"    'ĉ': 'C',\n",
"    'č': 'C',\n",
"    '̕': \"?\",\n",
"    \"'\": '?',\n",
"    'ʔ': \"?\",\n",
"    'ꞌ': \"?\",\n",
"    '̛': \"?\",\n",
"    '’': \"?\",\n",
"    'ʼ': \"?\",\n",
"    \"'\": '?',\n",
"    'â': 'A',\n",
"    'â': 'A',\n",
"    'ȃ': 'A',\n",
"    'ž': 'Z',\n",
"    'š': 'S',\n",
"    'W': 'v',\n",
"    'β': 'f',\n",
"    'е': 'e',\n",
"    '`': \"?\",\n",
"    'ɑ': 'A',\n",
"    'ɑ': 'A',\n",
"    'ʃ': 'S',\n",
"    'ð': 'z',\n",
"    'ɾ': 'r',\n",
"    'æ': 'a',\n",
"    'ɪ': 'e',\n",
"    'χ': 'x',\n",
"    'ɣ': 'q',\n",
"    'ʒ': 'Z',\n",
"    ':': '',\n",
"    'ː': '',\n",
"    'ā': 'A',\n",
"    'ː': '',\n",
"    'ä': 'A',\n",
"    'á': 'A',\n",
"    'š': 'S',\n",
"    'ū': 'u',\n",
"    'û': 'u',\n",
"    'ś': 's',\n",
"    'ī': 'i',\n",
"    'í': 'i',\n",
"    'î': 'i',\n",
"    'é': 'e',\n",
"    'ḥ': 'h',\n",
"    'ɒ': 'A',\n",
"    'ʰ': '',\n",
"    'ə': 'e',\n",
"    'R': 'r',\n",
"    'W': 'v',\n",
"    'Q': 'q',\n",
"    'T': 't',\n",
"    'Y': 'y',\n",
"    'P': 'p',\n",
"    'D': 'd',\n",
"    'F': 'f',\n",
"    'H': 'h',\n",
"    'J': 'j',\n",
"    'L': 'l',\n",
"    'X': 'x',\n",
"    'V': 'v',\n",
"    'B': 'b',\n",
"    'N': 'n',\n",
"    'M': 'm',\n",
"    'K': 'k',\n",
"    'G': 'g',\n",
"    'U': 'u',\n",
"    'O': 'o',\n",
"    'I': 'i',\n",
"    'E': 'e',\n",
"    'ŋ': 'ng',\n",
"    '.': '',\n",
"    'ɛ': 'e',\n",
"    'ʊ': 'u',\n",
"    \"ˈ\": '?',\n",
"    'ù': 'u',\n",
"    'θ': 's',\n",
"    '̪': '',\n",
"    'ũ': 'u',\n",
"    '_': '',\n",
"    'ç': 'C',\n",
"    'ĝ': 'q',\n",
"    'ɢ': 'q',\n",
"    'ː': '',\n",
"    'í': 'i',\n",
"    'ŝ': 'S',\n",
"    '!': '',\n",
"    'ǧ': 'q',\n",
"    'ʻ': '?',\n",
"    'è': 'e',\n",
"    '�': '',\n",
"    'ú': 'u',\n",
"    'ô': 'o',\n",
"    'ē': 'e',\n",
"    'à': 'A',\n",
"    'ă': 'A',\n",
"    'ǐ': 'i',\n",
"    'ü': 'u',\n",
"    '\\u200e': '',\n",
"    'ğ': 'q',\n",
"    'ṣ': 'S',\n",
"    'â': 'A',\n",
"    'â': 'A',\n",
"    'ȃ': 'A',\n",
"    'ž': 'Z',\n",
"    'š': 'S',\n",
"    'ā': 'A',\n",
"    'ː': '',\n",
"    'ä': 'A',\n",
"    'á': 'A',\n",
"    'š': 'S',\n",
"    'ū': 'u',\n",
"    'û': 'u',\n",
"    'ś': 'S',\n",
"    'ī': 'i',\n",
"    'í': 'i',\n",
"    'î': 'i',\n",
"    'é': 'e',\n",
"}\n",
"\n",
"consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n",
"vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n",
"\n",
"\n",
"def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n",
"    # Collapse multi-character sequences first, chaining each substitution\n",
"    # on the result of the previous one\n",
"    substituted = re.sub(r'tʃʰ', 'C', input_string)\n",
"    substituted = re.sub(r't͡ʃ', 'C', substituted)\n",
"    substituted = re.sub(r'tʃ', 'C', substituted)\n",
"    substituted = re.sub(r't͡S', 'C', substituted)\n",
"    substituted = re.sub(r'ow', 'o', substituted)\n",
"    substituted = re.sub('d͡ʒ', 'j', substituted)\n",
"    substituted = re.sub('dʒ', 'j', substituted)\n",
"\n",
"    # Create a translation table using str.maketrans\n",
"    translation_table = str.maketrans(char_map)\n",
"\n",
"    # Use str.translate to replace characters based on the translation table\n",
"    translated = substituted.translate(translation_table)\n",
"\n",
"    return translated"
],
"metadata": {
"id": "TKx8oA1n7rKh"
},
"execution_count": null,
"outputs": []
},
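{
"cell_type": "markdown",
"source": [
"A small sanity check of the mapping (illustrative input, not a benchmark sentence): the affricate `tʃ` should collapse to `C` and `ʃ` should map to `S`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Illustrative input only: IPA-style 'tʃeʃm' should come out as 'CeSm'\n",
"print(replace_phonetic_characters('tʃeʃm'))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},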
{
"cell_type": "markdown",
"metadata": {
"id": "XjAPkfq7SF87"
},
"source": [
"# Get Evaluation Data"
]
},
{
"cell_type": "code",
"source": [
"!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
],
"metadata": {
"id": "qwCG0jX-88nQ",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ea16b431-5340-458d-b44c-69b62bf49f8d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2025-05-11 10:20:36-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
"Resolving huggingface.co (huggingface.co)... 18.172.134.124, 18.172.134.4, 18.172.134.88, ...\n",
"Connecting to huggingface.co (huggingface.co)|18.172.134.124|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 56026 (55K) [text/plain]\n",
"Saving to: ‘SentenceBench.csv’\n",
"\n",
"\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.01s \n",
"\n",
"2025-05-11 10:20:36 (4.25 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"sentence_bench = pd.read_csv('SentenceBench.csv')"
],
"metadata": {
"id": "hJO-UAPDQvcb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sentence_bench.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "qlYbrnUa9LAN",
"outputId": "4b2b2c89-6aa3-4ba7-e2b3-65f89fcafc66"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" dataset grapheme \\\n",
"0 homograph من قدر تو را می‌دانم \n",
"1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
"2 homograph به دست و صورتم کرم زدم \n",
"\n",
" phoneme homograph word \\\n",
"0 man qadr-e to rA mi-dAnam قدر \n",
"1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
"2 be dast-o suratam kerem zadam کرم \n",
"\n",
" pronunciation \n",
"0 qadr \n",
"1 qadar \n",
"2 kerem "
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "sentence_bench",
"summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. \\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wDV7ysXf2b_H"
},
"source": [
"### Get ManaTTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TcL5ZLvSSnVB",
"outputId": "a97f69de-1be2-4f16-eb91-f09665363313"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
" 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
" ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
" 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
" ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
" 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
]
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
"\n",
"# Convert to a list of tuples\n",
"mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
"\n",
"mana_evaluation_data[:3]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Jjacw9Mp2eoX"
},
"source": [
"### Get CommonVoice"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-yQnqCGw26sk",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "c067a709-b33a-4386-960a-3d3eabbbe1a4"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
" 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
" ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
" 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
" ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
]
},
"metadata": {},
"execution_count": 19
}
],
"source": [
"filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
"\n",
"# Convert to a list of tuples\n",
"commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
"\n",
"commonvoice_evaluation_data[:3]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ciSPyhRc3Rvo"
},
"source": [
"### Get Homograph"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XlFc5JbN3Rvz",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a87599ed-4e71-4300-bd19-11105420dbad"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
" ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
" '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
" 'قدر',\n",
" 'qadar'),\n",
" ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
]
},
"metadata": {},
"execution_count": 20
}
],
"source": [
"filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word', 'pronunciation']]\n",
"\n",
"# Convert to a list of tuples\n",
"homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
"\n",
"homograph_evaluation_data[:3]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R6PE5ds45TPr"
},
"source": [
"# Evaluate Method Outputs"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CLKaERek4u_D"
},
"source": [
"## PER Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nBee9xG54u_E"
},
"outputs": [],
"source": [
"def remove_non_word_chars(text):\n",
"    pattern = r'[^\\w\\s\\?]'\n",
"    cleaned_text = re.sub(pattern, ' ', text)\n",
"    return cleaned_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W8PoNV9V4u_E"
},
"outputs": [],
"source": [
"def remove_white_spaces(text):\n",
"    cleaned_text = re.sub(r'\\s+', ' ', text)\n",
"    return cleaned_text.strip()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YD0cvnn74u_E"
},
"outputs": [],
"source": [
"def get_word_only_text(text):\n",
"    word_only_text = remove_non_word_chars(text)\n",
"    extra_space_removed_text = remove_white_spaces(word_only_text)\n",
"\n",
"    return extra_space_removed_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6OQQDual4u_E"
},
"outputs": [],
"source": [
"def get_texts_cer(reference, model_output):\n",
"    # Preprocess input texts to only contain word characters\n",
"    word_only_reference = get_word_only_text(reference)\n",
"    word_only_output = get_word_only_text(model_output)\n",
"\n",
"    # Return +infinity for CER if any of the texts is empty\n",
"    if not word_only_reference.strip() or not word_only_output.strip():\n",
"        return float('inf')\n",
"\n",
"    return cer(word_only_reference, word_only_output)"
]
},
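{
"cell_type": "markdown",
"source": [
"A toy example of the CER helper (made-up strings, not benchmark rows): one inserted character against an eight-character reference gives 1/8 = 0.125."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Illustrative strings only: 'man qadar' differs from 'man qadr' by one insertion,\n",
"# and the reference 'man qadr' is eight characters long (spaces included)\n",
"print(get_texts_cer('man qadr', 'man qadar'))  # 1/8 = 0.125"
],
"metadata": {},
"execution_count": null,
"outputs": []
},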
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ncWQnPdW4u_E"
},
"outputs": [],
"source": [
"def get_avg_cer_of_method(method_outputs, references):\n",
"    # Average CER over the benchmark; hyphens (boundary markers) are stripped\n",
"    # first so they do not count as errors. Note that the model output is\n",
"    # passed as the first argument of get_texts_cer here.\n",
"    cers = []\n",
"    for idx, o in enumerate(method_outputs):\n",
"        sample_cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
"        if sample_cer != float('inf'):\n",
"            cers.append(sample_cer)\n",
"\n",
"    return sum(cers) / len(cers)"
]
},
{
"cell_type": "markdown",
"source": [
"## Homograph Evaluation"
],
"metadata": {
"id": "oBgNtpFQDwku"
}
},
{
"cell_type": "code",
"source": [
"def get_homograph_performance(outputs, references):\n",
"    # Accuracy over homograph rows only: a prediction counts as correct if the\n",
"    # expected pronunciation appears as a substring of the model output.\n",
"    corrects = 0\n",
"    total = 0\n",
"\n",
"    for idx, (g, p, homograph, right) in enumerate(references):\n",
"        if homograph != '':\n",
"            total += 1\n",
"            if right in outputs[idx]:\n",
"                corrects += 1\n",
"\n",
"    return corrects / total"
],
"metadata": {
"id": "J445ULEvEEDn"
},
"execution_count": null,
"outputs": []
},
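{
"cell_type": "markdown",
"source": [
"The homograph metric is a plain substring check: a row counts as correct if the annotated pronunciation appears anywhere in the output string. A minimal illustration using the first homograph rows shown earlier:"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Substring containment, as used inside get_homograph_performance\n",
"print('qadar' in '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram')  # True\n",
"print('qadr' in '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram')   # False"
],
"metadata": {},
"execution_count": null,
"outputs": []
},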
{
"cell_type": "markdown",
"source": [
"# Full Bench"
],
"metadata": {
"id": "JGEUIrbi9kNH"
}
},
{
"cell_type": "code",
"source": [
"benchmark = []\n",
"\n",
"for g, p in mana_evaluation_data:\n",
"    benchmark.append((g, p, '', ''))\n",
"\n",
"for g, p in commonvoice_evaluation_data:\n",
"    benchmark.append((g, p, '', ''))\n",
"\n",
"for g, p, w, r in homograph_evaluation_data:\n",
"    benchmark.append((g, p, w, r))\n",
"\n",
"benchmark = benchmark[:400]"
],
"metadata": {
"id": "fGzQvL8V9mln"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def print_all_metrics(predictions):\n",
"    per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
"    homograph = get_homograph_performance(predictions, benchmark) * 100\n",
"\n",
"    print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
"    print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
],
"metadata": {
"id": "DpSqE5oPbmAy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Outputs"
],
"metadata": {
"id": "DsyvYuOPHTh0"
}
},
{
"cell_type": "code",
"source": [
"from tqdm import tqdm\n",
"import time\n",
"\n",
"outputs = []\n",
"start_time = time.time()\n",
"\n",
"for g, p, _, _ in tqdm(benchmark):\n",
"    o = run_script_with_args(g)\n",
"    outputs.append(o)\n",
"\n",
"total_time = time.time() - start_time\n",
"avg_time = total_time / len(benchmark) if len(benchmark) > 0 else 0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9d2752bd-22b3-41ff-9173-d77aefff7baf",
"id": "VeCOWPbeHTh1"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"100%|██████████| 400/400 [1:20:06<00:00, 12.02s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"mapped_outputs = []\n",
"for o in outputs:\n",
"    mapped = replace_phonetic_characters(o)\n",
"    mapped_outputs.append(mapped)"
],
"metadata": {
"id": "K-catlB6Esuf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print_all_metrics(mapped_outputs)\n",
"print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
"print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)\")"
],
"metadata": {
"id": "H2taHCPWCnls",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "55a4e1a0-0e99-4cde-b677-919b835cfb41"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4806.7417 (s)\n",
"AVG TIME:\t\t12.0169 (s)\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Runs\n",
"\n",
"## First\n",
"\n",
"```\n",
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4223.2665 (s)\n",
"AVG TIME:\t\t10.5582 (s)\n",
"```\n",
"\n",
"## Second\n",
"\n",
"```\n",
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4512.4389 (s)\n",
"AVG TIME:\t\t11.2811 (s)\n",
"```\n",
"\n",
"## Third\n",
"\n",
"```\n",
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4413.8484 (s)\n",
"AVG TIME:\t\t11.0346 (s)\n",
"```\n",
"\n",
"## Fourth\n",
"\n",
"```\n",
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4318.4309 (s)\n",
"AVG TIME:\t\t10.7961 (s)\n",
"```\n",
"\n",
"## Fifth\n",
"\n",
"```\n",
"PER: \t\t\t35.2321\n",
"HOMOGRAPH: \t\t21.2264\n",
"TOTAL TIME:\t\t4806.7417 (s)\n",
"AVG TIME:\t\t12.0169 (s)\n",
"```"
],
"metadata": {
"id": "aTXAKXmkJCSE"
}
}
]
}