Benchmarking notebooks for various Persian G2P models, comparing their performance on the SentenceBench dataset, including Homo-GE2PE and Homo-T5.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Benchmark_dmort27_epitran.ipynb 49KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261
  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "provenance": [],
  7. "collapsed_sections": [
  8. "VtxEYym69RUH",
  9. "XjAPkfq7SF87"
  10. ]
  11. },
  12. "kernelspec": {
  13. "name": "python3",
  14. "display_name": "Python 3"
  15. },
  16. "language_info": {
  17. "name": "python"
  18. }
  19. },
  20. "cells": [
  21. {
  22. "cell_type": "markdown",
  23. "metadata": {
  24. "id": "WEY5MiKLzurH"
  25. },
  26. "source": [
  27. "# Setup Environment"
  28. ]
  29. },
  30. {
  31. "cell_type": "code",
  32. "source": [
  33. "! pip install epitran==1.26.0"
  34. ],
  35. "metadata": {
  36. "id": "jviCS0zCmtJc",
  37. "colab": {
  38. "base_uri": "https://localhost:8080/"
  39. },
  40. "outputId": "e8d100ba-e606-4956-ee15-81ccc6557ba6"
  41. },
  42. "execution_count": null,
  43. "outputs": [
  44. {
  45. "output_type": "stream",
  46. "name": "stdout",
  47. "text": [
  48. "Collecting epitran==1.26.0\n",
  49. " Downloading epitran-1.26.0-py2.py3-none-any.whl.metadata (34 kB)\n",
  50. "Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (75.2.0)\n",
  51. "Requirement already satisfied: regex in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (2024.11.6)\n",
  52. "Collecting panphon>=0.20 (from epitran==1.26.0)\n",
  53. " Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)\n",
  54. "Requirement already satisfied: marisa-trie in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (1.2.1)\n",
  55. "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from epitran==1.26.0) (2.32.3)\n",
  56. "Collecting jamo (from epitran==1.26.0)\n",
  57. " Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)\n",
  58. "Collecting unicodecsv (from panphon>=0.20->epitran==1.26.0)\n",
  59. " Downloading unicodecsv-0.14.1.tar.gz (10 kB)\n",
  60. " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  61. "Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (6.0.2)\n",
  62. "Requirement already satisfied: numpy>=1.20.2 in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (2.0.2)\n",
  63. "Requirement already satisfied: editdistance in /usr/local/lib/python3.11/dist-packages (from panphon>=0.20->epitran==1.26.0) (0.8.1)\n",
  64. "Collecting munkres (from panphon>=0.20->epitran==1.26.0)\n",
  65. " Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)\n",
  66. "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (3.4.1)\n",
  67. "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (3.10)\n",
  68. "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (2.4.0)\n",
  69. "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->epitran==1.26.0) (2025.4.26)\n",
  70. "Downloading epitran-1.26.0-py2.py3-none-any.whl (188 kB)\n",
  71. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.5/188.5 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  72. "\u001b[?25hDownloading panphon-0.21.2-py2.py3-none-any.whl (75 kB)\n",
  73. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  74. "\u001b[?25hDownloading jamo-0.4.1-py3-none-any.whl (9.5 kB)\n",
  75. "Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)\n",
  76. "Building wheels for collected packages: unicodecsv\n",
  77. " Building wheel for unicodecsv (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  78. " Created wheel for unicodecsv: filename=unicodecsv-0.14.1-py3-none-any.whl size=10744 sha256=9d5442e17e65cdf34cadb6d4681337702fde69e9bea33a290ccb2bc88151e8b5\n",
  79. " Stored in directory: /root/.cache/pip/wheels/ec/03/6f/d2e0162d94c0d451556fa43dd4d5531457245c34a36b41ef4a\n",
  80. "Successfully built unicodecsv\n",
  81. "Installing collected packages: unicodecsv, munkres, jamo, panphon, epitran\n",
  82. "Successfully installed epitran-1.26.0 jamo-0.4.1 munkres-1.1.4 panphon-0.21.2 unicodecsv-0.14.1\n"
  83. ]
  84. }
  85. ]
  86. },
  87. {
  88. "cell_type": "code",
  89. "source": [
  90. "! pip install g2pk==0.9.4"
  91. ],
  92. "metadata": {
  93. "colab": {
  94. "base_uri": "https://localhost:8080/"
  95. },
  96. "id": "vxh7pA-mwSDV",
  97. "outputId": "f03e0881-3acb-4ab1-fbbb-016a0c4069f3"
  98. },
  99. "execution_count": null,
  100. "outputs": [
  101. {
  102. "output_type": "stream",
  103. "name": "stdout",
  104. "text": [
  105. "Collecting g2pk==0.9.4\n",
  106. " Downloading g2pK-0.9.4-py3-none-any.whl.metadata (7.5 kB)\n",
  107. "Requirement already satisfied: jamo in /usr/local/lib/python3.11/dist-packages (from g2pk==0.9.4) (0.4.1)\n",
  108. "Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (from g2pk==0.9.4) (3.9.1)\n",
  109. "Collecting konlpy (from g2pk==0.9.4)\n",
  110. " Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)\n",
  111. "Collecting python-mecab-ko (from g2pk==0.9.4)\n",
  112. " Downloading python_mecab_ko-1.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
  113. "Collecting JPype1>=0.7.0 (from konlpy->g2pk==0.9.4)\n",
  114. " Downloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
  115. "Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.11/dist-packages (from konlpy->g2pk==0.9.4) (5.4.0)\n",
  116. "Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.11/dist-packages (from konlpy->g2pk==0.9.4) (2.0.2)\n",
  117. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (8.1.8)\n",
  118. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (1.4.2)\n",
  119. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (2024.11.6)\n",
  120. "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk->g2pk==0.9.4) (4.67.1)\n",
  121. "Collecting python-mecab-ko-dic (from python-mecab-ko->g2pk==0.9.4)\n",
  122. " Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl.metadata (1.4 kB)\n",
  123. "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from JPype1>=0.7.0->konlpy->g2pk==0.9.4) (24.2)\n",
  124. "Downloading g2pK-0.9.4-py3-none-any.whl (27 kB)\n",
  125. "Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)\n",
  126. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.4/19.4 MB\u001b[0m \u001b[31m61.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  127. "\u001b[?25hDownloading python_mecab_ko-1.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (580 kB)\n",
  128. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m580.9/580.9 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  129. "\u001b[?25hDownloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)\n",
  130. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m494.1/494.1 kB\u001b[0m \u001b[31m33.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  131. "\u001b[?25hDownloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl (34.5 MB)\n",
  132. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.5/34.5 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  133. "\u001b[?25hInstalling collected packages: python-mecab-ko-dic, python-mecab-ko, JPype1, konlpy, g2pk\n",
  134. "Successfully installed JPype1-1.5.2 g2pk-0.9.4 konlpy-0.6.0 python-mecab-ko-1.3.7 python-mecab-ko-dic-2.1.1.post2\n"
  135. ]
  136. }
  137. ]
  138. },
  139. {
  140. "cell_type": "code",
  141. "source": [
  142. "! pip install jiwer"
  143. ],
  144. "metadata": {
  145. "colab": {
  146. "base_uri": "https://localhost:8080/"
  147. },
  148. "id": "stR7NfnfZqB1",
  149. "outputId": "c5e09b12-0552-4e2d-fd8f-387c8308d1c4"
  150. },
  151. "execution_count": null,
  152. "outputs": [
  153. {
  154. "output_type": "stream",
  155. "name": "stdout",
  156. "text": [
  157. "Collecting jiwer\n",
  158. " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
  159. "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
  160. "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
  161. " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
  162. "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
  163. "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
  164. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  165. "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
  166. "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
  167. ]
  168. }
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": null,
  174. "metadata": {
  175. "id": "PfthI4eOqBri"
  176. },
  177. "outputs": [],
  178. "source": [
  179. "import os\n",
  180. "import re\n",
  181. "import csv\n",
  182. "import pandas as pd\n",
  183. "import json\n",
  184. "import itertools\n",
  185. "from tqdm import tqdm\n",
  186. "from jiwer import cer"
  187. ]
  188. },
  189. {
  190. "cell_type": "markdown",
  191. "source": [
  192. "# mapping"
  193. ],
  194. "metadata": {
  195. "id": "VtxEYym69RUH"
  196. }
  197. },
  198. {
  199. "cell_type": "code",
  200. "source": [
  201. "output_to_phonetics_map = {\n",
  202. " 'м': 'm',\n",
  203. " 'ʷ':' v',\n",
  204. " 'w': 'v',\n",
  205. " 'c': 'k',\n",
  206. " 'ĉ': 'C',\n",
  207. " 'č': 'C',\n",
  208. " '̕': \"?\",\n",
  209. " \"'\": '?',\n",
  210. " 'ʔ': \"?\",\n",
  211. " 'ꞌ': \"?\",\n",
  212. " '̛': \"?\",\n",
  213. " '’': \"?\",\n",
  214. " 'ʼ': \"?\",\n",
  215. " \"'\": '?',\n",
  216. " 'â': 'A',\n",
  217. " 'â': 'A',\n",
  218. " 'ȃ': 'A',\n",
  219. " 'ž': 'Z',\n",
  220. " 'š': 'S',\n",
  221. " 'W': 'v',\n",
  222. " 'β': 'f',\n",
  223. " 'е': 'e',\n",
  224. " '`': \"?\",\n",
  225. " 'ɑ': 'A',\n",
  226. " 'ɑ': 'A',\n",
  227. " 'ʃ': 'S',\n",
  228. " 'ð': 'z',\n",
  229. " 'ɾ': 'r',\n",
  230. " 'æ': 'a',\n",
  231. " 'ɪ': 'e',\n",
  232. " 'χ': 'x',\n",
  233. " 'ɣ': 'q',\n",
  234. " 'ʒ': 'Z',\n",
  235. " ':': '',\n",
  236. " 'ː': '',\n",
  237. " 'ā': 'A',\n",
  238. " 'ː': '',\n",
  239. " 'ä': 'A',\n",
  240. " 'á': 'A',\n",
  241. " 'š': 'S',\n",
  242. " 'ū': 'u',\n",
  243. " 'û': 'u',\n",
  244. " 'ś': 's',\n",
  245. " 'ī': 'i',\n",
  246. " 'í': 'i',\n",
  247. " 'î': 'i',\n",
  248. " 'é': 'e',\n",
  249. " 'ḥ': 'h',\n",
  250. " 'ɒ': 'A',\n",
  251. " 'ʰ': '',\n",
  252. " 'ə': 'e',\n",
  253. " 'R': 'r',\n",
  254. " 'W': 'v',\n",
  255. " 'Q': 'q',\n",
  256. " 'T': 't',\n",
  257. " 'Y': 'y',\n",
  258. " 'P': 'p',\n",
  259. " 'D': 'd',\n",
  260. " 'F': 'f',\n",
  261. " 'H': 'h',\n",
  262. " 'J': 'j',\n",
  263. " 'L': 'l',\n",
  264. " 'X': 'x',\n",
  265. " 'V': 'v',\n",
  266. " 'B': 'b',\n",
  267. " 'N': 'n',\n",
  268. " 'M': 'm',\n",
  269. " 'K': 'k',\n",
  270. " 'G': 'g',\n",
  271. " 'U': 'u',\n",
  272. " 'O': 'o',\n",
  273. " 'I': 'i',\n",
  274. " 'E': 'e',\n",
  275. " 'ŋ': 'ng',\n",
  276. " '.': '',\n",
  277. " 'ɛ': 'e',\n",
  278. " 'ʊ': 'u',\n",
  279. " \"ˈ\": '?',\n",
  280. " 'ù': 'u',\n",
  281. " 'θ': 's',\n",
  282. " '̪': '',\n",
  283. " 'ũ': 'u',\n",
  284. " '_': '',\n",
  285. " 'ç': 'C',\n",
  286. " 'ĝ': 'q',\n",
  287. " 'ɢ': 'q',\n",
  288. " 'ː': '',\n",
  289. " 'í': 'i',\n",
  290. " 'ŝ': 'S',\n",
  291. " '!': '',\n",
  292. " 'ǧ': 'q',\n",
  293. " 'ʻ': '?',\n",
  294. " 'è': 'e',\n",
  295. " '�': '',\n",
  296. " 'ú': 'u',\n",
  297. " 'ô': 'o',\n",
  298. " 'ē': 'e',\n",
  299. " 'à': 'A',\n",
  300. " 'ă': 'A',\n",
  301. " 'ǐ': 'i',\n",
  302. " 'ü': 'u',\n",
  303. " '\\u200e': '',\n",
  304. " 'ğ': 'q',\n",
  305. " 'ṣ': 'S',\n",
  306. " 'â': 'A',\n",
  307. " 'â': 'A',\n",
  308. " 'ȃ': 'A',\n",
  309. " 'ž': 'Z',\n",
  310. " 'š': 'S',\n",
  311. " 'ā': 'A',\n",
  312. " 'ː': '',\n",
  313. " 'ä': 'A',\n",
  314. " 'á': 'A',\n",
  315. " 'š': 'S',\n",
  316. " 'ū': 'u',\n",
  317. " 'û': 'u',\n",
  318. " 'ś': 'S',\n",
  319. " 'ī': 'i',\n",
  320. " 'í': 'i',\n",
  321. " 'î': 'i',\n",
  322. " 'é': 'e',\n",
  323. "}\n",
  324. "\n",
  325. "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n",
  326. "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n",
  327. "\n",
  328. "\n",
  329. "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n",
  330. " substituted = re.sub(r'tʃʰ', 'C', input_string)\n",
  331. "substituted = re.sub(r't͡ʃ', 'C', substituted)\n",
  332. " substituted = re.sub(r'tʃ', 'C', substituted)\n",
  333. " substituted = re.sub(r't͡S', 'C', substituted)\n",
  334. " substituted = re.sub(r'ow', 'o', substituted)\n",
  335. " substituted = re.sub('d͡ʒ', 'j', substituted)\n",
  336. " substituted = re.sub('dʒ', 'j', substituted)\n",
  337. "\n",
  338. " # Create a translation table using str.maketrans\n",
  339. " translation_table = str.maketrans(char_map)\n",
  340. "\n",
  341. " # Use str.translate to replace characters based on the translation table\n",
  342. " translated = substituted.translate(translation_table)\n",
  343. "\n",
  344. " return translated"
  345. ],
  346. "metadata": {
  347. "id": "TKx8oA1n7rKh"
  348. },
  349. "execution_count": null,
  350. "outputs": []
  351. },
  352. {
  353. "cell_type": "markdown",
  354. "metadata": {
  355. "id": "XjAPkfq7SF87"
  356. },
  357. "source": [
  358. "# Get Evaluation Data"
  359. ]
  360. },
  361. {
  362. "cell_type": "code",
  363. "source": [
  364. "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
  365. ],
  366. "metadata": {
  367. "id": "qwCG0jX-88nQ",
  368. "colab": {
  369. "base_uri": "https://localhost:8080/"
  370. },
  371. "outputId": "bda9ccb4-f4d8-432b-f460-bfcbea7e462b"
  372. },
  373. "execution_count": null,
  374. "outputs": [
  375. {
  376. "output_type": "stream",
  377. "name": "stdout",
  378. "text": [
  379. "--2025-05-10 11:19:00-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
  380. "Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.55, 18.164.174.118, ...\n",
  381. "Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.\n",
  382. "HTTP request sent, awaiting response... 200 OK\n",
  383. "Length: 56026 (55K) [text/plain]\n",
  384. "Saving to: ‘SentenceBench.csv’\n",
  385. "\n",
  386. "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.008s \n",
  387. "\n",
  388. "2025-05-10 11:19:00 (6.90 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
  389. "\n"
  390. ]
  391. }
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "source": [
  397. "sentence_bench = pd.read_csv('SentenceBench.csv')"
  398. ],
  399. "metadata": {
  400. "id": "hJO-UAPDQvcb"
  401. },
  402. "execution_count": null,
  403. "outputs": []
  404. },
  405. {
  406. "cell_type": "code",
  407. "source": [
  408. "sentence_bench.head(3)"
  409. ],
  410. "metadata": {
  411. "colab": {
  412. "base_uri": "https://localhost:8080/",
  413. "height": 143
  414. },
  415. "id": "qlYbrnUa9LAN",
  416. "outputId": "2fa1904b-72eb-4df9-9d92-f3918ce8ccf3"
  417. },
  418. "execution_count": null,
  419. "outputs": [
  420. {
  421. "output_type": "execute_result",
  422. "data": {
  423. "text/plain": [
  424. " dataset grapheme \\\n",
  425. "0 homograph من قدر تو را می‌دانم \n",
  426. "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
  427. "2 homograph به دست و صورتم کرم زدم \n",
  428. "\n",
  429. " phoneme homograph word \\\n",
  430. "0 man qadr-e to rA mi-dAnam قدر \n",
  431. "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
  432. "2 be dast-o suratam kerem zadam کرم \n",
  433. "\n",
  434. " pronunciation \n",
  435. "0 qadr \n",
  436. "1 qadar \n",
  437. "2 kerem "
  438. ],
  439. "text/html": [
  440. "\n",
  441. " <div id=\"df-d1a95323-5d76-483e-a34d-ea1e21f453f0\" class=\"colab-df-container\">\n",
  442. " <div>\n",
  443. "<style scoped>\n",
  444. " .dataframe tbody tr th:only-of-type {\n",
  445. " vertical-align: middle;\n",
  446. " }\n",
  447. "\n",
  448. " .dataframe tbody tr th {\n",
  449. " vertical-align: top;\n",
  450. " }\n",
  451. "\n",
  452. " .dataframe thead th {\n",
  453. " text-align: right;\n",
  454. " }\n",
  455. "</style>\n",
  456. "<table border=\"1\" class=\"dataframe\">\n",
  457. " <thead>\n",
  458. " <tr style=\"text-align: right;\">\n",
  459. " <th></th>\n",
  460. " <th>dataset</th>\n",
  461. " <th>grapheme</th>\n",
  462. " <th>phoneme</th>\n",
  463. " <th>homograph word</th>\n",
  464. " <th>pronunciation</th>\n",
  465. " </tr>\n",
  466. " </thead>\n",
  467. " <tbody>\n",
  468. " <tr>\n",
  469. " <th>0</th>\n",
  470. " <td>homograph</td>\n",
  471. " <td>من قدر تو را می‌دانم</td>\n",
  472. " <td>man qadr-e to rA mi-dAnam</td>\n",
  473. " <td>قدر</td>\n",
  474. " <td>qadr</td>\n",
  475. " </tr>\n",
  476. " <tr>\n",
  477. " <th>1</th>\n",
  478. " <td>homograph</td>\n",
  479. " <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
  480. " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
  481. " <td>قدر</td>\n",
  482. " <td>qadar</td>\n",
  483. " </tr>\n",
  484. " <tr>\n",
  485. " <th>2</th>\n",
  486. " <td>homograph</td>\n",
  487. " <td>به دست و صورتم کرم زدم</td>\n",
  488. " <td>be dast-o suratam kerem zadam</td>\n",
  489. " <td>کرم</td>\n",
  490. " <td>kerem</td>\n",
  491. " </tr>\n",
  492. " </tbody>\n",
  493. "</table>\n",
  494. "</div>\n",
  495. " <div class=\"colab-df-buttons\">\n",
  496. "\n",
  497. " <div class=\"colab-df-container\">\n",
  498. " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d1a95323-5d76-483e-a34d-ea1e21f453f0')\"\n",
  499. " title=\"Convert this dataframe to an interactive table.\"\n",
  500. " style=\"display:none;\">\n",
  501. "\n",
  502. " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
  503. " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
  504. " </svg>\n",
  505. " </button>\n",
  506. "\n",
  507. " <style>\n",
  508. " .colab-df-container {\n",
  509. " display:flex;\n",
  510. " gap: 12px;\n",
  511. " }\n",
  512. "\n",
  513. " .colab-df-convert {\n",
  514. " background-color: #E8F0FE;\n",
  515. " border: none;\n",
  516. " border-radius: 50%;\n",
  517. " cursor: pointer;\n",
  518. " display: none;\n",
  519. " fill: #1967D2;\n",
  520. " height: 32px;\n",
  521. " padding: 0 0 0 0;\n",
  522. " width: 32px;\n",
  523. " }\n",
  524. "\n",
  525. " .colab-df-convert:hover {\n",
  526. " background-color: #E2EBFA;\n",
  527. " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  528. " fill: #174EA6;\n",
  529. " }\n",
  530. "\n",
  531. " .colab-df-buttons div {\n",
  532. " margin-bottom: 4px;\n",
  533. " }\n",
  534. "\n",
  535. " [theme=dark] .colab-df-convert {\n",
  536. " background-color: #3B4455;\n",
  537. " fill: #D2E3FC;\n",
  538. " }\n",
  539. "\n",
  540. " [theme=dark] .colab-df-convert:hover {\n",
  541. " background-color: #434B5C;\n",
  542. " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
  543. " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
  544. " fill: #FFFFFF;\n",
  545. " }\n",
  546. " </style>\n",
  547. "\n",
  548. " <script>\n",
  549. " const buttonEl =\n",
  550. " document.querySelector('#df-d1a95323-5d76-483e-a34d-ea1e21f453f0 button.colab-df-convert');\n",
  551. " buttonEl.style.display =\n",
  552. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  553. "\n",
  554. " async function convertToInteractive(key) {\n",
  555. " const element = document.querySelector('#df-d1a95323-5d76-483e-a34d-ea1e21f453f0');\n",
  556. " const dataTable =\n",
  557. " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
  558. " [key], {});\n",
  559. " if (!dataTable) return;\n",
  560. "\n",
  561. " const docLinkHtml = 'Like what you see? Visit the ' +\n",
  562. " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
  563. " + ' to learn more about interactive tables.';\n",
  564. " element.innerHTML = '';\n",
  565. " dataTable['output_type'] = 'display_data';\n",
  566. " await google.colab.output.renderOutput(dataTable, element);\n",
  567. " const docLink = document.createElement('div');\n",
  568. " docLink.innerHTML = docLinkHtml;\n",
  569. " element.appendChild(docLink);\n",
  570. " }\n",
  571. " </script>\n",
  572. " </div>\n",
  573. "\n",
  574. "\n",
  575. " <div id=\"df-34db1839-0805-4a8a-bab1-4ca3ae0c2d29\">\n",
  576. " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-34db1839-0805-4a8a-bab1-4ca3ae0c2d29')\"\n",
  577. " title=\"Suggest charts\"\n",
  578. " style=\"display:none;\">\n",
  579. "\n",
  580. "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
  581. " width=\"24px\">\n",
  582. " <g>\n",
  583. " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
  584. " </g>\n",
  585. "</svg>\n",
  586. " </button>\n",
  587. "\n",
  588. "<style>\n",
  589. " .colab-df-quickchart {\n",
  590. " --bg-color: #E8F0FE;\n",
  591. " --fill-color: #1967D2;\n",
  592. " --hover-bg-color: #E2EBFA;\n",
  593. " --hover-fill-color: #174EA6;\n",
  594. " --disabled-fill-color: #AAA;\n",
  595. " --disabled-bg-color: #DDD;\n",
  596. " }\n",
  597. "\n",
  598. " [theme=dark] .colab-df-quickchart {\n",
  599. " --bg-color: #3B4455;\n",
  600. " --fill-color: #D2E3FC;\n",
  601. " --hover-bg-color: #434B5C;\n",
  602. " --hover-fill-color: #FFFFFF;\n",
  603. " --disabled-bg-color: #3B4455;\n",
  604. " --disabled-fill-color: #666;\n",
  605. " }\n",
  606. "\n",
  607. " .colab-df-quickchart {\n",
  608. " background-color: var(--bg-color);\n",
  609. " border: none;\n",
  610. " border-radius: 50%;\n",
  611. " cursor: pointer;\n",
  612. " display: none;\n",
  613. " fill: var(--fill-color);\n",
  614. " height: 32px;\n",
  615. " padding: 0;\n",
  616. " width: 32px;\n",
  617. " }\n",
  618. "\n",
  619. " .colab-df-quickchart:hover {\n",
  620. " background-color: var(--hover-bg-color);\n",
  621. " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  622. " fill: var(--button-hover-fill-color);\n",
  623. " }\n",
  624. "\n",
  625. " .colab-df-quickchart-complete:disabled,\n",
  626. " .colab-df-quickchart-complete:disabled:hover {\n",
  627. " background-color: var(--disabled-bg-color);\n",
  628. " fill: var(--disabled-fill-color);\n",
  629. " box-shadow: none;\n",
  630. " }\n",
  631. "\n",
  632. " .colab-df-spinner {\n",
  633. " border: 2px solid var(--fill-color);\n",
  634. " border-color: transparent;\n",
  635. " border-bottom-color: var(--fill-color);\n",
  636. " animation:\n",
  637. " spin 1s steps(1) infinite;\n",
  638. " }\n",
  639. "\n",
  640. " @keyframes spin {\n",
  641. " 0% {\n",
  642. " border-color: transparent;\n",
  643. " border-bottom-color: var(--fill-color);\n",
  644. " border-left-color: var(--fill-color);\n",
  645. " }\n",
  646. " 20% {\n",
  647. " border-color: transparent;\n",
  648. " border-left-color: var(--fill-color);\n",
  649. " border-top-color: var(--fill-color);\n",
  650. " }\n",
  651. " 30% {\n",
  652. " border-color: transparent;\n",
  653. " border-left-color: var(--fill-color);\n",
  654. " border-top-color: var(--fill-color);\n",
  655. " border-right-color: var(--fill-color);\n",
  656. " }\n",
  657. " 40% {\n",
  658. " border-color: transparent;\n",
  659. " border-right-color: var(--fill-color);\n",
  660. " border-top-color: var(--fill-color);\n",
  661. " }\n",
  662. " 60% {\n",
  663. " border-color: transparent;\n",
  664. " border-right-color: var(--fill-color);\n",
  665. " }\n",
  666. " 80% {\n",
  667. " border-color: transparent;\n",
  668. " border-right-color: var(--fill-color);\n",
  669. " border-bottom-color: var(--fill-color);\n",
  670. " }\n",
  671. " 90% {\n",
  672. " border-color: transparent;\n",
  673. " border-bottom-color: var(--fill-color);\n",
  674. " }\n",
  675. " }\n",
  676. "</style>\n",
  677. "\n",
  678. " <script>\n",
  679. " async function quickchart(key) {\n",
  680. " const quickchartButtonEl =\n",
  681. " document.querySelector('#' + key + ' button');\n",
  682. " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
  683. " quickchartButtonEl.classList.add('colab-df-spinner');\n",
  684. " try {\n",
  685. " const charts = await google.colab.kernel.invokeFunction(\n",
  686. " 'suggestCharts', [key], {});\n",
  687. " } catch (error) {\n",
  688. " console.error('Error during call to suggestCharts:', error);\n",
  689. " }\n",
  690. " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
  691. " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
  692. " }\n",
  693. " (() => {\n",
  694. " let quickchartButtonEl =\n",
  695. " document.querySelector('#df-34db1839-0805-4a8a-bab1-4ca3ae0c2d29 button');\n",
  696. " quickchartButtonEl.style.display =\n",
  697. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  698. " })();\n",
  699. " </script>\n",
  700. " </div>\n",
  701. " </div>\n",
  702. " </div>\n"
  703. ],
  704. "application/vnd.google.colaboratory.intrinsic+json": {
  705. "type": "dataframe",
  706. "variable_name": "sentence_bench",
  707. "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
  708. }
  709. },
  710. "metadata": {},
  711. "execution_count": 8
  712. }
  713. ]
  714. },
  715. {
  716. "cell_type": "markdown",
  717. "metadata": {
  718. "id": "wDV7ysXf2b_H"
  719. },
  720. "source": [
  721. "### Get ManaTTS"
  722. ]
  723. },
  724. {
  725. "cell_type": "code",
  726. "execution_count": null,
  727. "metadata": {
  728. "colab": {
  729. "base_uri": "https://localhost:8080/"
  730. },
  731. "id": "TcL5ZLvSSnVB",
  732. "outputId": "59e9cd68-4665-4b68-bc35-9d80d2cc03d9"
  733. },
  734. "outputs": [
  735. {
  736. "output_type": "execute_result",
  737. "data": {
  738. "text/plain": [
  739. "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
  740. " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
  741. " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
  742. " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
  743. " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
  744. " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
  745. ]
  746. },
  747. "metadata": {},
  748. "execution_count": 9
  749. }
  750. ],
  751. "source": [
  752. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
  753. "\n",
  754. "# Convert to a list of tuples\n",
  755. "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  756. "\n",
  757. "mana_evaluation_data[:3]"
  758. ]
  759. },
  760. {
  761. "cell_type": "markdown",
  762. "metadata": {
  763. "id": "Jjacw9Mp2eoX"
  764. },
  765. "source": [
  766. "### Get CommonVoice"
  767. ]
  768. },
  769. {
  770. "cell_type": "code",
  771. "execution_count": null,
  772. "metadata": {
  773. "id": "-yQnqCGw26sk",
  774. "colab": {
  775. "base_uri": "https://localhost:8080/"
  776. },
  777. "outputId": "253e406c-5fb7-4b8f-fc2e-25a289e5bb0d"
  778. },
  779. "outputs": [
  780. {
  781. "output_type": "execute_result",
  782. "data": {
  783. "text/plain": [
  784. "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
  785. " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
  786. " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
  787. " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
  788. " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
  789. ]
  790. },
  791. "metadata": {},
  792. "execution_count": 10
  793. }
  794. ],
  795. "source": [
  796. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
  797. "\n",
  798. "# Convert to a list of tuples\n",
  799. "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  800. "\n",
  801. "commonvoice_evaluation_data[:3]"
  802. ]
  803. },
  804. {
  805. "cell_type": "markdown",
  806. "metadata": {
  807. "id": "ciSPyhRc3Rvo"
  808. },
  809. "source": [
  810. "### Get Homograph"
  811. ]
  812. },
  813. {
  814. "cell_type": "code",
  815. "execution_count": null,
  816. "metadata": {
  817. "id": "XlFc5JbN3Rvz",
  818. "colab": {
  819. "base_uri": "https://localhost:8080/"
  820. },
  821. "outputId": "7d6b2c71-afe5-4e1b-dc9d-16c0581e3222"
  822. },
  823. "outputs": [
  824. {
  825. "output_type": "execute_result",
  826. "data": {
  827. "text/plain": [
  828. "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
  829. " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
  830. " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
  831. " 'قدر',\n",
  832. " 'qadar'),\n",
  833. " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
  834. ]
  835. },
  836. "metadata": {},
  837. "execution_count": 11
  838. }
  839. ],
  840. "source": [
  841. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
  842. "\n",
  843. "# Convert to a list of tuples\n",
  844. "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  845. "\n",
  846. "homograph_evaluation_data[:3]"
  847. ]
  848. },
  849. {
  850. "cell_type": "markdown",
  851. "metadata": {
  852. "id": "R6PE5ds45TPr"
  853. },
  854. "source": [
  855. "# Evaluate Method Outputs"
  856. ]
  857. },
  858. {
  859. "cell_type": "markdown",
  860. "metadata": {
  861. "id": "CLKaERek4u_D"
  862. },
  863. "source": [
  864. "## PER Evaluation"
  865. ]
  866. },
  867. {
  868. "cell_type": "code",
  869. "execution_count": null,
  870. "metadata": {
  871. "id": "nBee9xG54u_E"
  872. },
  873. "outputs": [],
  874. "source": [
  875. "def remove_non_word_chars(text):\n",
  876. " pattern = r'[^\\w\\s\\?]'\n",
  877. " cleaned_text = re.sub(pattern, ' ', text)\n",
  878. " return cleaned_text"
  879. ]
  880. },
  881. {
  882. "cell_type": "code",
  883. "execution_count": null,
  884. "metadata": {
  885. "id": "W8PoNV9V4u_E"
  886. },
  887. "outputs": [],
  888. "source": [
  889. "def remove_white_spaces(text):\n",
  890. " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
  891. " return cleaned_text.strip()"
  892. ]
  893. },
  894. {
  895. "cell_type": "code",
  896. "execution_count": null,
  897. "metadata": {
  898. "id": "YD0cvnn74u_E"
  899. },
  900. "outputs": [],
  901. "source": [
  902. "def get_word_only_text(text):\n",
  903. " word_only_text = remove_non_word_chars(text)\n",
  904. " extra_space_removed_text = remove_white_spaces(word_only_text)\n",
  905. "\n",
  906. " return extra_space_removed_text"
  907. ]
  908. },
  909. {
  910. "cell_type": "code",
  911. "execution_count": null,
  912. "metadata": {
  913. "id": "6OQQDual4u_E"
  914. },
  915. "outputs": [],
  916. "source": [
  917. "def get_texts_cer(reference, model_output):\n",
  918. " # Preprocess input texts to only contain word characters\n",
  919. " word_only_reference = get_word_only_text(reference)\n",
  920. " word_only_output = get_word_only_text(model_output)\n",
  921. "\n",
  922. " # Return +infinity for CER if any of the texts is empty\n",
  923. " if not word_only_reference.strip() or not word_only_output.strip():\n",
  924. " return float('inf')\n",
  925. "\n",
  926. " return cer(word_only_reference, word_only_output)"
  927. ]
  928. },
  929. {
  930. "cell_type": "code",
  931. "execution_count": null,
  932. "metadata": {
  933. "id": "ncWQnPdW4u_E"
  934. },
  935. "outputs": [],
  936. "source": [
  937. "def get_avg_cer_of_method(method_outputs, references):\n",
  938. " cers = []\n",
  939. " for idx, o in enumerate(method_outputs):\n",
  940. " cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
  941. " if cer != float('inf'):\n",
  942. " cers.append(cer)\n",
  943. "\n",
  944. " return sum(cers) / len(cers)"
  945. ]
  946. },
  947. {
  948. "cell_type": "markdown",
  949. "source": [
  950. "## Homograph Evaluation"
  951. ],
  952. "metadata": {
  953. "id": "oBgNtpFQDwku"
  954. }
  955. },
  956. {
  957. "cell_type": "code",
  958. "source": [
  959. "def get_homograph_performance(outputs, references):\n",
  960. " corrects = 0\n",
  961. " total = 0\n",
  962. "\n",
  963. " for idx, (g, p, homograph, right) in enumerate(references):\n",
  964. " if homograph != '':\n",
  965. " total += 1\n",
  966. " if right in outputs[idx]:\n",
  967. " corrects += 1\n",
  968. "\n",
  969. " return corrects / total"
  970. ],
  971. "metadata": {
  972. "id": "J445ULEvEEDn"
  973. },
  974. "execution_count": null,
  975. "outputs": []
  976. },
  977. {
  978. "cell_type": "markdown",
  979. "source": [
  980. "# Full bench"
  981. ],
  982. "metadata": {
  983. "id": "JGEUIrbi9kNH"
  984. }
  985. },
  986. {
  987. "cell_type": "code",
  988. "source": [
  989. "benchmark = []\n",
  990. "\n",
  991. "for g, p in mana_evaluation_data:\n",
  992. " benchmark.append((g, p, '', ''))\n",
  993. "\n",
  994. "for g, p in commonvoice_evaluation_data:\n",
  995. " benchmark.append((g, p, '', ''))\n",
  996. "\n",
  997. "for g, p, w, r in homograph_evaluation_data:\n",
  998. " benchmark.append((g, p, w, r))\n",
  999. "\n",
  1000. "benchmark = benchmark[:400]"
  1001. ],
  1002. "metadata": {
  1003. "id": "fGzQvL8V9mln"
  1004. },
  1005. "execution_count": null,
  1006. "outputs": []
  1007. },
  1008. {
  1009. "cell_type": "code",
  1010. "source": [
  1011. "def print_all_metrics(predictions):\n",
  1012. " per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
  1013. " homograph = get_homograph_performance(predictions, benchmark) * 100\n",
  1014. "\n",
  1015. " print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
  1016. " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
  1017. ],
  1018. "metadata": {
  1019. "id": "DpSqE5oPbmAy"
  1020. },
  1021. "execution_count": null,
  1022. "outputs": []
  1023. },
  1024. {
  1025. "cell_type": "markdown",
  1026. "source": [
  1027. "# Epitran"
  1028. ],
  1029. "metadata": {
  1030. "id": "k6XT11uMBnGp"
  1031. }
  1032. },
  1033. {
  1034. "cell_type": "code",
  1035. "source": [
  1036. "import epitran\n",
  1037. "epi = epitran.Epitran('fas-Arab')"
  1038. ],
  1039. "metadata": {
  1040. "id": "A53DAk2_Dakd"
  1041. },
  1042. "execution_count": null,
  1043. "outputs": []
  1044. },
  1045. {
  1046. "cell_type": "code",
  1047. "source": [
  1048. "epi.transliterate(u'دلم میخواست برم ')"
  1049. ],
  1050. "metadata": {
  1051. "colab": {
  1052. "base_uri": "https://localhost:8080/",
  1053. "height": 35
  1054. },
  1055. "id": "L84ue_vWwdZl",
  1056. "outputId": "06e6a744-7fd5-46d2-d0e2-48a2ef9dc133"
  1057. },
  1058. "execution_count": null,
  1059. "outputs": [
  1060. {
  1061. "output_type": "execute_result",
  1062. "data": {
  1063. "text/plain": [
  1064. "'dlm mjxvɒst brm '"
  1065. ],
  1066. "application/vnd.google.colaboratory.intrinsic+json": {
  1067. "type": "string"
  1068. }
  1069. },
  1070. "metadata": {},
  1071. "execution_count": 21
  1072. }
  1073. ]
  1074. },
  1075. {
  1076. "cell_type": "code",
  1077. "source": [
  1078. "replace_phonetic_characters(epi.transliterate(u'دلم میخواست برم '))"
  1079. ],
  1080. "metadata": {
  1081. "id": "I_1WYcyaZyTR",
  1082. "colab": {
  1083. "base_uri": "https://localhost:8080/",
  1084. "height": 35
  1085. },
  1086. "outputId": "927eb9fa-2bd3-44f2-8abd-92d90c9767af"
  1087. },
  1088. "execution_count": null,
  1089. "outputs": [
  1090. {
  1091. "output_type": "execute_result",
  1092. "data": {
  1093. "text/plain": [
  1094. "'dlm mjxvAst brm '"
  1095. ],
  1096. "application/vnd.google.colaboratory.intrinsic+json": {
  1097. "type": "string"
  1098. }
  1099. },
  1100. "metadata": {},
  1101. "execution_count": 22
  1102. }
  1103. ]
  1104. },
  1105. {
  1106. "cell_type": "markdown",
  1107. "source": [
  1108. "# outputs"
  1109. ],
  1110. "metadata": {
  1111. "id": "NLgJTtoCg4m_"
  1112. }
  1113. },
  1114. {
  1115. "cell_type": "code",
  1116. "source": [
  1117. "from tqdm import tqdm\n",
  1118. "import time\n",
  1119. "\n",
  1120. "outputs = []\n",
  1121. "start_time = time.time()\n",
  1122. "\n",
  1123. "for g, p, _, _ in tqdm(benchmark):\n",
  1124. " o = epi.transliterate(g)\n",
  1125. " outputs.append(o)\n",
  1126. "\n",
  1127. "total_time = time.time() - start_time\n",
  1128. "avg_time = total_time / len(benchmark) if len(benchmark) > 0 else 0\n",
  1129. "print(f\"Total: {total_time:.2f}s | Avg: {avg_time:.4f}s/sample\")"
  1130. ],
  1131. "metadata": {
  1132. "colab": {
  1133. "base_uri": "https://localhost:8080/"
  1134. },
  1135. "id": "ECW_8Ja5g7FY",
  1136. "outputId": "2c778f9b-7957-4b6f-9116-08681762d1e8"
  1137. },
  1138. "execution_count": null,
  1139. "outputs": [
  1140. {
  1141. "output_type": "stream",
  1142. "name": "stderr",
  1143. "text": [
  1144. "100%|██████████| 400/400 [00:00<00:00, 3625.70it/s]"
  1145. ]
  1146. },
  1147. {
  1148. "output_type": "stream",
  1149. "name": "stdout",
  1150. "text": [
  1151. "Total: 0.12s | Avg: 0.0003s/sample\n"
  1152. ]
  1153. },
  1154. {
  1155. "output_type": "stream",
  1156. "name": "stderr",
  1157. "text": [
  1158. "\n"
  1159. ]
  1160. }
  1161. ]
  1162. },
  1163. {
  1164. "cell_type": "code",
  1165. "source": [
  1166. "mapped_outputs = []\n",
  1167. "for o in outputs:\n",
  1168. " mapped = replace_phonetic_characters(o)\n",
  1169. " mapped_outputs.append(mapped)\n",
  1170. " mapped.replace('j', 'y')"
  1171. ],
  1172. "metadata": {
  1173. "id": "K-catlB6Esuf"
  1174. },
  1175. "execution_count": null,
  1176. "outputs": []
  1177. },
  1178. {
  1179. "cell_type": "code",
  1180. "source": [
  1181. "print_all_metrics(mapped_outputs)\n",
  1182. "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
  1183. "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)\")"
  1184. ],
  1185. "metadata": {
  1186. "colab": {
  1187. "base_uri": "https://localhost:8080/"
  1188. },
  1189. "id": "H2taHCPWCnls",
  1190. "outputId": "c3e8950a-898b-45ea-bfb3-0ac9f384c296"
  1191. },
  1192. "execution_count": null,
  1193. "outputs": [
  1194. {
  1195. "output_type": "stream",
  1196. "name": "stdout",
  1197. "text": [
  1198. "PER: \t\t\t45.1223\n",
  1199. "HOMOGRAPH: \t\t0.0000\n",
  1200. "TOTAL TIME:\t\t0.1184 (s)\n",
  1201. "AVG TIME:\t\t0.0003 (s)\n"
  1202. ]
  1203. }
  1204. ]
  1205. },
  1206. {
  1207. "cell_type": "markdown",
  1208. "source": [
  1209. "# Runs\n",
  1210. "\n",
  1211. "## First:\n",
  1212. "\n",
  1213. "```\n",
  1214. "PER: \t\t\t45.1223\n",
  1215. "HOMOGRAPH: \t\t0.0000\n",
  1216. "TOTAL TIME:\t\t0.1172 (s)\n",
  1217. "AVG TIME:\t\t0.0003 (s)\n",
  1218. "```\n",
  1219. "\n",
  1220. "## Second\n",
  1221. "\n",
  1222. "```\n",
  1223. "PER: \t\t\t45.1223\n",
  1224. "HOMOGRAPH: \t\t0.0000\n",
  1225. "TOTAL TIME:\t\t0.1074 (s)\n",
  1226. "AVG TIME:\t\t0.0003 (s)\n",
  1227. "```\n",
  1228. "\n",
  1229. "## Third\n",
  1230. "\n",
  1231. "```\n",
  1232. "PER: \t\t\t45.1223\n",
  1233. "HOMOGRAPH: \t\t0.0000\n",
  1234. "TOTAL TIME:\t\t0.1296 (s)\n",
  1235. "AVG TIME:\t\t0.0003 (s)\n",
  1236. "```\n",
  1237. "\n",
  1238. "## Fourth\n",
  1239. "\n",
  1240. "```\n",
  1241. "PER: \t\t\t45.1223\n",
  1242. "HOMOGRAPH: \t\t0.0000\n",
  1243. "TOTAL TIME:\t\t0.1085 (s)\n",
  1244. "AVG TIME:\t\t0.0003 (s)\n",
  1245. "```\n",
  1246. "\n",
  1247. "## Fifth\n",
  1248. "\n",
  1249. "```\n",
  1250. "PER: \t\t\t45.1223\n",
  1251. "HOMOGRAPH: \t\t0.0000\n",
  1252. "TOTAL TIME:\t\t0.1184 (s)\n",
  1253. "AVG TIME:\t\t0.0003 (s)\n",
  1254. "```"
  1255. ],
  1256. "metadata": {
  1257. "id": "dq7_g71Wivog"
  1258. }
  1259. }
  1260. ]
  1261. }