Benchmarking notebooks for various Persian G2P models, comparing their performance on the SentenceBench dataset, including Homo-GE2PE and Homo-T5.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Benchmark_de_mh_persian_phonemizer.ipynb 73KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430
  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "provenance": []
  7. },
  8. "kernelspec": {
  9. "name": "python3",
  10. "display_name": "Python 3"
  11. },
  12. "language_info": {
  13. "name": "python"
  14. }
  15. },
  16. "cells": [
  17. {
  18. "cell_type": "markdown",
  19. "metadata": {
  20. "id": "WEY5MiKLzurH"
  21. },
  22. "source": [
  23. "# Setup Environment"
  24. ]
  25. },
  26. {
  27. "cell_type": "code",
  28. "source": [
  29. "! pip install hazm==0.10.0"
  30. ],
  31. "metadata": {
  32. "colab": {
  33. "base_uri": "https://localhost:8080/",
  34. "height": 1000
  35. },
  36. "id": "QtQ-huXYtepn",
  37. "outputId": "7755d71f-82c4-4a8f-dd14-919767ccd82a"
  38. },
  39. "execution_count": null,
  40. "outputs": [
  41. {
  42. "output_type": "stream",
  43. "name": "stdout",
  44. "text": [
  45. "Collecting hazm==0.10.0\n",
  46. " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
  47. "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n",
  48. " Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
  49. "Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n",
  50. " Downloading flashtext-2.7.tar.gz (14 kB)\n",
  51. " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  52. "Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n",
  53. " Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
  54. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
  55. "Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
  56. " Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
  57. "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n",
  58. " Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
  59. "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
  60. "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n",
  61. " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
  62. "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
  63. "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n",
  64. " Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
  65. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  66. "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
  67. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
  68. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.4.2)\n",
  69. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
  70. "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
  71. "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
  72. "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
  73. "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
  74. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  75. "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
  76. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  77. "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
  78. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m77.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  79. "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
  80. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  81. "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
  82. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m37.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  83. "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
  84. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  85. "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
  86. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  87. "\u001b[?25hBuilding wheels for collected packages: flashtext\n",
  88. " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  89. " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=ced3adb209606d785c23d09dcadd043fd23f7c63a88a04dc1a3922017818b447\n",
  90. " Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
  91. "Successfully built flashtext\n",
  92. "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n",
  93. " Attempting uninstall: numpy\n",
  94. " Found existing installation: numpy 2.0.2\n",
  95. " Uninstalling numpy-2.0.2:\n",
  96. " Successfully uninstalled numpy-2.0.2\n",
  97. " Attempting uninstall: scipy\n",
  98. " Found existing installation: scipy 1.15.2\n",
  99. " Uninstalling scipy-1.15.2:\n",
  100. " Successfully uninstalled scipy-1.15.2\n",
  101. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  102. "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
  103. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
  104. "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
  105. "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
  106. "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  107. "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  108. "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
  109. "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
  110. "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
  111. "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
  112. ]
  113. },
  114. {
  115. "output_type": "display_data",
  116. "data": {
  117. "application/vnd.colab-display-data+json": {
  118. "pip_warning": {
  119. "packages": [
  120. "numpy"
  121. ]
  122. },
  123. "id": "954f89a9223948ee9ece2a3af9016355"
  124. }
  125. },
  126. "metadata": {}
  127. }
  128. ]
  129. },
  130. {
  131. "cell_type": "code",
  132. "source": [
  133. "! pip install persian_phonemizer"
  134. ],
  135. "metadata": {
  136. "colab": {
  137. "base_uri": "https://localhost:8080/"
  138. },
  139. "id": "JJ9VISNqZS6m",
  140. "outputId": "56e5c328-ecda-429f-e8b7-d9044dae080a"
  141. },
  142. "execution_count": null,
  143. "outputs": [
  144. {
  145. "output_type": "stream",
  146. "name": "stdout",
  147. "text": [
  148. "Collecting persian_phonemizer\n",
  149. " Downloading persian_phonemizer-0.4.0-py3-none-any.whl.metadata (3.1 kB)\n",
  150. "Requirement already satisfied: spacy>=3.3 in /usr/local/lib/python3.11/dist-packages (from persian_phonemizer) (3.8.5)\n",
  151. "Collecting g2p-fa (from persian_phonemizer)\n",
  152. " Downloading g2p_fa-1.1.0-py3-none-any.whl.metadata (2.7 kB)\n",
  153. "Requirement already satisfied: hazm in /usr/local/lib/python3.11/dist-packages (from persian_phonemizer) (0.10.0)\n",
  154. "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (3.0.12)\n",
  155. "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (1.0.5)\n",
  156. "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (1.0.12)\n",
  157. "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (2.0.11)\n",
  158. "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (3.0.9)\n",
  159. "Requirement already satisfied: thinc<8.4.0,>=8.3.4 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (8.3.6)\n",
  160. "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (1.1.3)\n",
  161. "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (2.5.1)\n",
  162. "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (2.0.10)\n",
  163. "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (0.4.1)\n",
  164. "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (0.15.3)\n",
  165. "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (4.67.1)\n",
  166. "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (1.24.3)\n",
  167. "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (2.32.3)\n",
  168. "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (2.11.4)\n",
  169. "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (3.1.6)\n",
  170. "Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (75.2.0)\n",
  171. "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (24.2)\n",
  172. "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.3->persian_phonemizer) (3.5.0)\n",
  173. "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (from g2p-fa->persian_phonemizer) (2.6.0+cu124)\n",
  174. "Requirement already satisfied: fasttext-wheel<0.10.0,>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (0.9.2)\n",
  175. "Requirement already satisfied: flashtext<3.0,>=2.7 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (2.7)\n",
  176. "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (4.3.3)\n",
  177. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (3.9.1)\n",
  178. "Requirement already satisfied: python-crfsuite<0.10.0,>=0.9.9 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (0.9.11)\n",
  179. "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm->persian_phonemizer) (1.6.1)\n",
  180. "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm->persian_phonemizer) (2.13.6)\n",
  181. "Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm->persian_phonemizer) (1.13.1)\n",
  182. "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm->persian_phonemizer) (7.1.0)\n",
  183. "Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.11/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy>=3.3->persian_phonemizer) (1.3.0)\n",
  184. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->persian_phonemizer) (8.1.8)\n",
  185. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->persian_phonemizer) (1.4.2)\n",
  186. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm->persian_phonemizer) (2024.11.6)\n",
  187. "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.3->persian_phonemizer) (0.7.0)\n",
  188. "Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.3->persian_phonemizer) (2.33.2)\n",
  189. "Requirement already satisfied: typing-extensions>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.3->persian_phonemizer) (4.13.2)\n",
  190. "Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.3->persian_phonemizer) (0.4.0)\n",
  191. "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.3->persian_phonemizer) (3.4.1)\n",
  192. "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.3->persian_phonemizer) (3.10)\n",
  193. "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.3->persian_phonemizer) (2.4.0)\n",
  194. "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.3->persian_phonemizer) (2025.4.26)\n",
  195. "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm->persian_phonemizer) (3.6.0)\n",
  196. "Requirement already satisfied: blis<1.4.0,>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from thinc<8.4.0,>=8.3.4->spacy>=3.3->persian_phonemizer) (1.3.0)\n",
  197. "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.11/dist-packages (from thinc<8.4.0,>=8.3.4->spacy>=3.3->persian_phonemizer) (0.1.5)\n",
  198. "INFO: pip is looking at multiple versions of thinc to determine which version is compatible with other requirements. This could take a while.\n",
  199. "Collecting thinc<8.4.0,>=8.3.4 (from spacy>=3.3->persian_phonemizer)\n",
  200. " Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n",
  201. "Collecting blis<1.3.0,>=1.2.0 (from thinc<8.4.0,>=8.3.4->spacy>=3.3->persian_phonemizer)\n",
  202. " Downloading blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
  203. "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3.3->persian_phonemizer) (1.5.4)\n",
  204. "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3.3->persian_phonemizer) (13.9.4)\n",
  205. "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3.3->persian_phonemizer) (0.21.0)\n",
  206. "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->spacy>=3.3->persian_phonemizer) (3.0.2)\n",
  207. "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (3.18.0)\n",
  208. "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (3.4.2)\n",
  209. "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (2025.3.2)\n",
  210. "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->g2p-fa->persian_phonemizer)\n",
  211. " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  212. "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->g2p-fa->persian_phonemizer)\n",
  213. " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  214. "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->g2p-fa->persian_phonemizer)\n",
  215. " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
  216. "Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->g2p-fa->persian_phonemizer)\n",
  217. " Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
  218. "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->g2p-fa->persian_phonemizer)\n",
  219. " Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  220. "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->g2p-fa->persian_phonemizer)\n",
  221. " Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  222. "Collecting nvidia-curand-cu12==10.3.5.147 (from torch->g2p-fa->persian_phonemizer)\n",
  223. " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  224. "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch->g2p-fa->persian_phonemizer)\n",
  225. " Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
  226. "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch->g2p-fa->persian_phonemizer)\n",
  227. " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
  228. "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (0.6.2)\n",
  229. "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (2.21.5)\n",
  230. "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (12.4.127)\n",
  231. "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch->g2p-fa->persian_phonemizer)\n",
  232. " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
  233. "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (3.2.0)\n",
  234. "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch->g2p-fa->persian_phonemizer) (1.13.1)\n",
  235. "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch->g2p-fa->persian_phonemizer) (1.3.0)\n",
  236. "Requirement already satisfied: marisa-trie>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy>=3.3->persian_phonemizer) (1.2.1)\n",
  237. "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.3->persian_phonemizer) (3.0.0)\n",
  238. "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.3->persian_phonemizer) (2.19.1)\n",
  239. "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm->persian_phonemizer) (1.17.2)\n",
  240. "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.3->persian_phonemizer) (0.1.2)\n",
  241. "Downloading persian_phonemizer-0.4.0-py3-none-any.whl (7.2 MB)\n",
  242. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  243. "\u001b[?25hDownloading g2p_fa-1.1.0-py3-none-any.whl (6.2 MB)\n",
  244. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m104.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  245. "\u001b[?25hDownloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n",
  246. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m98.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  247. "\u001b[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n",
  248. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  249. "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n",
  250. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m104.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  251. "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n",
  252. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m80.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  253. "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n",
  254. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  255. "\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n",
  256. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  257. "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n",
  258. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  259. "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n",
  260. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  261. "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n",
  262. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  263. "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n",
  264. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  265. "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
  266. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m90.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  267. "\u001b[?25hDownloading blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)\n",
  268. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.7/11.7 MB\u001b[0m \u001b[31m112.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  269. "\u001b[?25hInstalling collected packages: nvidia-nvjitlink-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, blis, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, thinc, g2p-fa, persian_phonemizer\n",
  270. " Attempting uninstall: nvidia-nvjitlink-cu12\n",
  271. " Found existing installation: nvidia-nvjitlink-cu12 12.5.82\n",
  272. " Uninstalling nvidia-nvjitlink-cu12-12.5.82:\n",
  273. " Successfully uninstalled nvidia-nvjitlink-cu12-12.5.82\n",
  274. " Attempting uninstall: nvidia-curand-cu12\n",
  275. " Found existing installation: nvidia-curand-cu12 10.3.6.82\n",
  276. " Uninstalling nvidia-curand-cu12-10.3.6.82:\n",
  277. " Successfully uninstalled nvidia-curand-cu12-10.3.6.82\n",
  278. " Attempting uninstall: nvidia-cufft-cu12\n",
  279. " Found existing installation: nvidia-cufft-cu12 11.2.3.61\n",
  280. " Uninstalling nvidia-cufft-cu12-11.2.3.61:\n",
  281. " Successfully uninstalled nvidia-cufft-cu12-11.2.3.61\n",
  282. " Attempting uninstall: nvidia-cuda-runtime-cu12\n",
  283. " Found existing installation: nvidia-cuda-runtime-cu12 12.5.82\n",
  284. " Uninstalling nvidia-cuda-runtime-cu12-12.5.82:\n",
  285. " Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82\n",
  286. " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n",
  287. " Found existing installation: nvidia-cuda-nvrtc-cu12 12.5.82\n",
  288. " Uninstalling nvidia-cuda-nvrtc-cu12-12.5.82:\n",
  289. " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.5.82\n",
  290. " Attempting uninstall: nvidia-cuda-cupti-cu12\n",
  291. " Found existing installation: nvidia-cuda-cupti-cu12 12.5.82\n",
  292. " Uninstalling nvidia-cuda-cupti-cu12-12.5.82:\n",
  293. " Successfully uninstalled nvidia-cuda-cupti-cu12-12.5.82\n",
  294. " Attempting uninstall: nvidia-cublas-cu12\n",
  295. " Found existing installation: nvidia-cublas-cu12 12.5.3.2\n",
  296. " Uninstalling nvidia-cublas-cu12-12.5.3.2:\n",
  297. " Successfully uninstalled nvidia-cublas-cu12-12.5.3.2\n",
  298. " Attempting uninstall: blis\n",
  299. " Found existing installation: blis 1.3.0\n",
  300. " Uninstalling blis-1.3.0:\n",
  301. " Successfully uninstalled blis-1.3.0\n",
  302. " Attempting uninstall: nvidia-cusparse-cu12\n",
  303. " Found existing installation: nvidia-cusparse-cu12 12.5.1.3\n",
  304. " Uninstalling nvidia-cusparse-cu12-12.5.1.3:\n",
  305. " Successfully uninstalled nvidia-cusparse-cu12-12.5.1.3\n",
  306. " Attempting uninstall: nvidia-cudnn-cu12\n",
  307. " Found existing installation: nvidia-cudnn-cu12 9.3.0.75\n",
  308. " Uninstalling nvidia-cudnn-cu12-9.3.0.75:\n",
  309. " Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75\n",
  310. " Attempting uninstall: nvidia-cusolver-cu12\n",
  311. " Found existing installation: nvidia-cusolver-cu12 11.6.3.83\n",
  312. " Uninstalling nvidia-cusolver-cu12-11.6.3.83:\n",
  313. " Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83\n",
  314. " Attempting uninstall: thinc\n",
  315. " Found existing installation: thinc 8.3.6\n",
  316. " Uninstalling thinc-8.3.6:\n",
  317. " Successfully uninstalled thinc-8.3.6\n",
  318. "Successfully installed blis-1.2.1 g2p-fa-1.1.0 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127 persian_phonemizer-0.4.0 thinc-8.3.4\n"
  319. ]
  320. }
  321. ]
  322. },
  323. {
  324. "cell_type": "code",
  325. "source": [
  326. "! pip install jiwer"
  327. ],
  328. "metadata": {
  329. "colab": {
  330. "base_uri": "https://localhost:8080/"
  331. },
  332. "id": "5OD5ZtfTck77",
  333. "outputId": "775bc630-e1d3-49d1-9e0f-62f96dc65ee5"
  334. },
  335. "execution_count": null,
  336. "outputs": [
  337. {
  338. "output_type": "stream",
  339. "name": "stdout",
  340. "text": [
  341. "Collecting jiwer\n",
  342. " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
  343. "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
  344. "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
  345. " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
  346. "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
  347. "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
  348. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m35.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  349. "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
  350. "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
  351. ]
  352. }
  353. ]
  354. },
  355. {
  356. "cell_type": "code",
  357. "source": [
  358. "import os\n",
  359. "import re\n",
  360. "from tqdm import tqdm\n",
  361. "import csv\n",
  362. "import pandas as pd\n",
  363. "import json\n",
  364. "import itertools\n",
  365. "from jiwer import cer"
  366. ],
  367. "metadata": {
  368. "id": "DLEJ-KSecpyV"
  369. },
  370. "execution_count": null,
  371. "outputs": []
  372. },
  373. {
  374. "cell_type": "code",
  375. "source": [
  376. "from persian_phonemizer import Phonemizer\n",
  377. "phonemizer = Phonemizer()\n",
  378. "phonemizer.phonemize('دلم میخواست برم ')"
  379. ],
  380. "metadata": {
  381. "colab": {
  382. "base_uri": "https://localhost:8080/",
  383. "height": 35
  384. },
  385. "id": "my5HlSvaaz6h",
  386. "outputId": "6f4c048f-412b-4940-97a7-6a6eb284881a"
  387. },
  388. "execution_count": null,
  389. "outputs": [
  390. {
  391. "output_type": "execute_result",
  392. "data": {
  393. "text/plain": [
  394. "'dele mæʒ miːxævɒːstʰ bæɾm'"
  395. ],
  396. "application/vnd.google.colaboratory.intrinsic+json": {
  397. "type": "string"
  398. }
  399. },
  400. "metadata": {},
  401. "execution_count": 4
  402. }
  403. ]
  404. },
  405. {
  406. "cell_type": "markdown",
  407. "source": [
  408. "# mapping"
  409. ],
  410. "metadata": {
  411. "id": "VtxEYym69RUH"
  412. }
  413. },
  414. {
  415. "cell_type": "code",
  416. "source": [
  417. "output_to_phonetics_map = {\n",
  418. " 'м': 'm',\n",
  419. " 'ʷ': 'v',\n",
  420. " 'c': 'k',\n",
  421. " 'ĉ': 'C',\n",
  422. " 'č': 'C',\n",
  423. " '̕': \"?\",\n",
  424. " \"'\": '?',\n",
  425. " 'ʔ': \"?\",\n",
  426. " 'ꞌ': \"?\",\n",
  427. " '̛': \"?\",\n",
  428. " '’': \"?\",\n",
  429. " 'ʼ': \"?\",\n",
  430. " \"'\": '?',\n",
  431. " 'â': 'A',\n",
  432. " 'â': 'A',\n",
  433. " 'ȃ': 'A',\n",
  434. " 'ž': 'Z',\n",
  435. " 'š': 'S',\n",
  436. " 'W': 'v',\n",
  437. " 'β': 'f',\n",
  438. " 'е': 'e',\n",
  439. " '`': \"?\",\n",
  440. " 'ɑ': 'A',\n",
  441. " 'ɑ': 'A',\n",
  442. " 'ʃ': 'S',\n",
  443. " 'ð': 'z',\n",
  444. " 'ɾ': 'r',\n",
  445. " 'æ': 'a',\n",
  446. " 'ɪ': 'e',\n",
  447. " 'χ': 'x',\n",
  448. " 'ɣ': 'q',\n",
  449. " 'ʒ': 'Z',\n",
  450. " ':': '',\n",
  451. " 'ː': '',\n",
  452. " 'ā': 'A',\n",
  453. " 'ː': '',\n",
  454. " 'ä': 'A',\n",
  455. " 'á': 'A',\n",
  456. " 'š': 'S',\n",
  457. " 'ū': 'u',\n",
  458. " 'û': 'u',\n",
  459. " 'ś': 's',\n",
  460. " 'ī': 'i',\n",
  461. " 'í': 'i',\n",
  462. " 'î': 'i',\n",
  463. " 'é': 'e',\n",
  464. " 'ḥ': 'h',\n",
  465. " 'ɒ': 'A',\n",
  466. " 'ʰ': '',\n",
  467. " 'ə': 'e',\n",
  468. " 'R': 'r',\n",
  469. " 'W': 'v',\n",
  470. " 'Q': 'q',\n",
  471. " 'T': 't',\n",
  472. " 'Y': 'y',\n",
  473. " 'P': 'p',\n",
  474. " 'D': 'd',\n",
  475. " 'F': 'f',\n",
  476. " 'H': 'h',\n",
  477. " 'J': 'j',\n",
  478. " 'L': 'l',\n",
  479. " 'X': 'x',\n",
  480. " 'V': 'v',\n",
  481. " 'B': 'b',\n",
  482. " 'N': 'n',\n",
  483. " 'M': 'm',\n",
  484. " 'K': 'k',\n",
  485. " 'G': 'g',\n",
  486. " 'U': 'u',\n",
  487. " 'O': 'o',\n",
  488. " 'I': 'i',\n",
  489. " 'E': 'e',\n",
  490. " 'ŋ': 'ng',\n",
  491. " '.': '',\n",
  492. " 'ɛ': 'e',\n",
  493. " 'ʊ': 'u',\n",
  494. " \"ˈ\": '?',\n",
  495. " 'ù': 'u',\n",
  496. " 'θ': 's',\n",
  497. " '̪': '',\n",
  498. " 'ũ': 'u',\n",
  499. " '_': '',\n",
  500. " 'ç': 'C',\n",
  501. " 'ĝ': 'q',\n",
  502. " 'ɢ': 'q',\n",
  503. " 'ː': '',\n",
  504. " 'í': 'i',\n",
  505. " 'ŝ': 'S',\n",
  506. " '!': '',\n",
  507. " 'ǧ': 'q',\n",
  508. " 'ʻ': '?',\n",
  509. " 'è': 'e',\n",
  510. " '�': '',\n",
  511. " 'ú': 'u',\n",
  512. " 'ô': 'o',\n",
  513. " 'ē': 'e',\n",
  514. " 'à': 'A',\n",
  515. " 'ă': 'A',\n",
  516. " 'ǐ': 'i',\n",
  517. " 'ü': 'u',\n",
  518. " '\\u200e': '',\n",
  519. " 'ğ': 'q',\n",
  520. " 'ṣ': 'S',\n",
  521. " 'â': 'A',\n",
  522. " 'â': 'A',\n",
  523. " 'ȃ': 'A',\n",
  524. " 'ž': 'Z',\n",
  525. " 'š': 'S',\n",
  526. " 'ā': 'A',\n",
  527. " 'ː': '',\n",
  528. " 'ä': 'A',\n",
  529. " 'á': 'A',\n",
  530. " 'š': 'S',\n",
  531. " 'ū': 'u',\n",
  532. " 'û': 'u',\n",
  533. " 'ś': 's',\n",
  534. " 'ī': 'i',\n",
  535. " 'í': 'i',\n",
  536. " 'î': 'i',\n",
  537. " 'é': 'e',\n",
  538. "}\n",
  539. "\n",
  540. "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n",
  541. "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n",
  542. "\n",
  543. "\n",
  544. "def replace_phonetic_characters(input_string, char_map=output_to_phonetics_map, from_phonetics=False):\n",
  545. " substituted = re.sub(r'tʃʰ', 'C', input_string)\n",
  546. " substituted = re.sub(r't͡ʃ', 'C', substituted)\n",
  547. " substituted = re.sub(r'tʃ', 'C', substituted)\n",
  548. " substituted = re.sub(r't͡S', 'C', substituted)\n",
  549. " substituted = re.sub(r'ow', 'o', substituted)\n",
  550. " substituted = re.sub('d͡ʒ', 'j', substituted)\n",
  551. " substituted = re.sub('dʒ', 'j', substituted)\n",
  552. "\n",
  553. " # Create a translation table using str.maketrans\n",
  554. " translation_table = str.maketrans(char_map)\n",
  555. "\n",
  556. " # Use str.translate to replace characters based on the translation table\n",
  557. " translated = substituted.translate(translation_table)\n",
  558. "\n",
  559. " return translated"
  560. ],
  561. "metadata": {
  562. "id": "TKx8oA1n7rKh"
  563. },
  564. "execution_count": null,
  565. "outputs": []
  566. },
  567. {
  568. "cell_type": "markdown",
  569. "metadata": {
  570. "id": "XjAPkfq7SF87"
  571. },
  572. "source": [
  573. "# Get Evaluation Data"
  574. ]
  575. },
  576. {
  577. "cell_type": "code",
  578. "source": [
  579. "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
  580. ],
  581. "metadata": {
  582. "id": "qwCG0jX-88nQ",
  583. "colab": {
  584. "base_uri": "https://localhost:8080/"
  585. },
  586. "outputId": "63d62243-fa2f-4241-bc6c-a9d5b24f8980"
  587. },
  588. "execution_count": null,
  589. "outputs": [
  590. {
  591. "output_type": "stream",
  592. "name": "stdout",
  593. "text": [
  594. "--2025-05-10 10:07:38-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
  595. "Resolving huggingface.co (huggingface.co)... 18.160.143.76, 18.160.143.99, 18.160.143.75, ...\n",
  596. "Connecting to huggingface.co (huggingface.co)|18.160.143.76|:443... connected.\n",
  597. "HTTP request sent, awaiting response... 200 OK\n",
  598. "Length: 56026 (55K) [text/plain]\n",
  599. "Saving to: ‘SentenceBench.csv’\n",
  600. "\n",
  601. "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.01s \n",
  602. "\n",
  603. "2025-05-10 10:07:38 (4.42 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
  604. "\n"
  605. ]
  606. }
  607. ]
  608. },
  609. {
  610. "cell_type": "code",
  611. "source": [
  612. "sentence_bench = pd.read_csv('SentenceBench.csv')"
  613. ],
  614. "metadata": {
  615. "id": "hJO-UAPDQvcb"
  616. },
  617. "execution_count": null,
  618. "outputs": []
  619. },
  620. {
  621. "cell_type": "code",
  622. "source": [
  623. "sentence_bench.head(3)"
  624. ],
  625. "metadata": {
  626. "colab": {
  627. "base_uri": "https://localhost:8080/",
  628. "height": 143
  629. },
  630. "id": "qlYbrnUa9LAN",
  631. "outputId": "584e795c-1801-4594-be3f-e4426a16925f"
  632. },
  633. "execution_count": null,
  634. "outputs": [
  635. {
  636. "output_type": "execute_result",
  637. "data": {
  638. "text/plain": [
  639. " dataset grapheme \\\n",
  640. "0 homograph من قدر تو را می‌دانم \n",
  641. "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
  642. "2 homograph به دست و صورتم کرم زدم \n",
  643. "\n",
  644. " phoneme homograph word \\\n",
  645. "0 man qadr-e to rA mi-dAnam قدر \n",
  646. "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
  647. "2 be dast-o suratam kerem zadam کرم \n",
  648. "\n",
  649. " pronunciation \n",
  650. "0 qadr \n",
  651. "1 qadar \n",
  652. "2 kerem "
  653. ],
  654. "text/html": [
  655. "\n",
  656. " <div id=\"df-75a41682-7a05-40b4-8ffb-951442296f9c\" class=\"colab-df-container\">\n",
  657. " <div>\n",
  658. "<style scoped>\n",
  659. " .dataframe tbody tr th:only-of-type {\n",
  660. " vertical-align: middle;\n",
  661. " }\n",
  662. "\n",
  663. " .dataframe tbody tr th {\n",
  664. " vertical-align: top;\n",
  665. " }\n",
  666. "\n",
  667. " .dataframe thead th {\n",
  668. " text-align: right;\n",
  669. " }\n",
  670. "</style>\n",
  671. "<table border=\"1\" class=\"dataframe\">\n",
  672. " <thead>\n",
  673. " <tr style=\"text-align: right;\">\n",
  674. " <th></th>\n",
  675. " <th>dataset</th>\n",
  676. " <th>grapheme</th>\n",
  677. " <th>phoneme</th>\n",
  678. " <th>homograph word</th>\n",
  679. " <th>pronunciation</th>\n",
  680. " </tr>\n",
  681. " </thead>\n",
  682. " <tbody>\n",
  683. " <tr>\n",
  684. " <th>0</th>\n",
  685. " <td>homograph</td>\n",
  686. " <td>من قدر تو را می‌دانم</td>\n",
  687. " <td>man qadr-e to rA mi-dAnam</td>\n",
  688. " <td>قدر</td>\n",
  689. " <td>qadr</td>\n",
  690. " </tr>\n",
  691. " <tr>\n",
  692. " <th>1</th>\n",
  693. " <td>homograph</td>\n",
  694. " <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
  695. " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
  696. " <td>قدر</td>\n",
  697. " <td>qadar</td>\n",
  698. " </tr>\n",
  699. " <tr>\n",
  700. " <th>2</th>\n",
  701. " <td>homograph</td>\n",
  702. " <td>به دست و صورتم کرم زدم</td>\n",
  703. " <td>be dast-o suratam kerem zadam</td>\n",
  704. " <td>کرم</td>\n",
  705. " <td>kerem</td>\n",
  706. " </tr>\n",
  707. " </tbody>\n",
  708. "</table>\n",
  709. "</div>\n",
  710. " <div class=\"colab-df-buttons\">\n",
  711. "\n",
  712. " <div class=\"colab-df-container\">\n",
  713. " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-75a41682-7a05-40b4-8ffb-951442296f9c')\"\n",
  714. " title=\"Convert this dataframe to an interactive table.\"\n",
  715. " style=\"display:none;\">\n",
  716. "\n",
  717. " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
  718. " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
  719. " </svg>\n",
  720. " </button>\n",
  721. "\n",
  722. " <style>\n",
  723. " .colab-df-container {\n",
  724. " display:flex;\n",
  725. " gap: 12px;\n",
  726. " }\n",
  727. "\n",
  728. " .colab-df-convert {\n",
  729. " background-color: #E8F0FE;\n",
  730. " border: none;\n",
  731. " border-radius: 50%;\n",
  732. " cursor: pointer;\n",
  733. " display: none;\n",
  734. " fill: #1967D2;\n",
  735. " height: 32px;\n",
  736. " padding: 0 0 0 0;\n",
  737. " width: 32px;\n",
  738. " }\n",
  739. "\n",
  740. " .colab-df-convert:hover {\n",
  741. " background-color: #E2EBFA;\n",
  742. " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  743. " fill: #174EA6;\n",
  744. " }\n",
  745. "\n",
  746. " .colab-df-buttons div {\n",
  747. " margin-bottom: 4px;\n",
  748. " }\n",
  749. "\n",
  750. " [theme=dark] .colab-df-convert {\n",
  751. " background-color: #3B4455;\n",
  752. " fill: #D2E3FC;\n",
  753. " }\n",
  754. "\n",
  755. " [theme=dark] .colab-df-convert:hover {\n",
  756. " background-color: #434B5C;\n",
  757. " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
  758. " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
  759. " fill: #FFFFFF;\n",
  760. " }\n",
  761. " </style>\n",
  762. "\n",
  763. " <script>\n",
  764. " const buttonEl =\n",
  765. " document.querySelector('#df-75a41682-7a05-40b4-8ffb-951442296f9c button.colab-df-convert');\n",
  766. " buttonEl.style.display =\n",
  767. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  768. "\n",
  769. " async function convertToInteractive(key) {\n",
  770. " const element = document.querySelector('#df-75a41682-7a05-40b4-8ffb-951442296f9c');\n",
  771. " const dataTable =\n",
  772. " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
  773. " [key], {});\n",
  774. " if (!dataTable) return;\n",
  775. "\n",
  776. " const docLinkHtml = 'Like what you see? Visit the ' +\n",
  777. " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
  778. " + ' to learn more about interactive tables.';\n",
  779. " element.innerHTML = '';\n",
  780. " dataTable['output_type'] = 'display_data';\n",
  781. " await google.colab.output.renderOutput(dataTable, element);\n",
  782. " const docLink = document.createElement('div');\n",
  783. " docLink.innerHTML = docLinkHtml;\n",
  784. " element.appendChild(docLink);\n",
  785. " }\n",
  786. " </script>\n",
  787. " </div>\n",
  788. "\n",
  789. "\n",
  790. " <div id=\"df-f541734f-766a-44ba-88c6-1b4f4c4c57b7\">\n",
  791. " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-f541734f-766a-44ba-88c6-1b4f4c4c57b7')\"\n",
  792. " title=\"Suggest charts\"\n",
  793. " style=\"display:none;\">\n",
  794. "\n",
  795. "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
  796. " width=\"24px\">\n",
  797. " <g>\n",
  798. " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
  799. " </g>\n",
  800. "</svg>\n",
  801. " </button>\n",
  802. "\n",
  803. "<style>\n",
  804. " .colab-df-quickchart {\n",
  805. " --bg-color: #E8F0FE;\n",
  806. " --fill-color: #1967D2;\n",
  807. " --hover-bg-color: #E2EBFA;\n",
  808. " --hover-fill-color: #174EA6;\n",
  809. " --disabled-fill-color: #AAA;\n",
  810. " --disabled-bg-color: #DDD;\n",
  811. " }\n",
  812. "\n",
  813. " [theme=dark] .colab-df-quickchart {\n",
  814. " --bg-color: #3B4455;\n",
  815. " --fill-color: #D2E3FC;\n",
  816. " --hover-bg-color: #434B5C;\n",
  817. " --hover-fill-color: #FFFFFF;\n",
  818. " --disabled-bg-color: #3B4455;\n",
  819. " --disabled-fill-color: #666;\n",
  820. " }\n",
  821. "\n",
  822. " .colab-df-quickchart {\n",
  823. " background-color: var(--bg-color);\n",
  824. " border: none;\n",
  825. " border-radius: 50%;\n",
  826. " cursor: pointer;\n",
  827. " display: none;\n",
  828. " fill: var(--fill-color);\n",
  829. " height: 32px;\n",
  830. " padding: 0;\n",
  831. " width: 32px;\n",
  832. " }\n",
  833. "\n",
  834. " .colab-df-quickchart:hover {\n",
  835. " background-color: var(--hover-bg-color);\n",
  836. " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  837. " fill: var(--button-hover-fill-color);\n",
  838. " }\n",
  839. "\n",
  840. " .colab-df-quickchart-complete:disabled,\n",
  841. " .colab-df-quickchart-complete:disabled:hover {\n",
  842. " background-color: var(--disabled-bg-color);\n",
  843. " fill: var(--disabled-fill-color);\n",
  844. " box-shadow: none;\n",
  845. " }\n",
  846. "\n",
  847. " .colab-df-spinner {\n",
  848. " border: 2px solid var(--fill-color);\n",
  849. " border-color: transparent;\n",
  850. " border-bottom-color: var(--fill-color);\n",
  851. " animation:\n",
  852. " spin 1s steps(1) infinite;\n",
  853. " }\n",
  854. "\n",
  855. " @keyframes spin {\n",
  856. " 0% {\n",
  857. " border-color: transparent;\n",
  858. " border-bottom-color: var(--fill-color);\n",
  859. " border-left-color: var(--fill-color);\n",
  860. " }\n",
  861. " 20% {\n",
  862. " border-color: transparent;\n",
  863. " border-left-color: var(--fill-color);\n",
  864. " border-top-color: var(--fill-color);\n",
  865. " }\n",
  866. " 30% {\n",
  867. " border-color: transparent;\n",
  868. " border-left-color: var(--fill-color);\n",
  869. " border-top-color: var(--fill-color);\n",
  870. " border-right-color: var(--fill-color);\n",
  871. " }\n",
  872. " 40% {\n",
  873. " border-color: transparent;\n",
  874. " border-right-color: var(--fill-color);\n",
  875. " border-top-color: var(--fill-color);\n",
  876. " }\n",
  877. " 60% {\n",
  878. " border-color: transparent;\n",
  879. " border-right-color: var(--fill-color);\n",
  880. " }\n",
  881. " 80% {\n",
  882. " border-color: transparent;\n",
  883. " border-right-color: var(--fill-color);\n",
  884. " border-bottom-color: var(--fill-color);\n",
  885. " }\n",
  886. " 90% {\n",
  887. " border-color: transparent;\n",
  888. " border-bottom-color: var(--fill-color);\n",
  889. " }\n",
  890. " }\n",
  891. "</style>\n",
  892. "\n",
  893. " <script>\n",
  894. " async function quickchart(key) {\n",
  895. " const quickchartButtonEl =\n",
  896. " document.querySelector('#' + key + ' button');\n",
  897. " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
  898. " quickchartButtonEl.classList.add('colab-df-spinner');\n",
  899. " try {\n",
  900. " const charts = await google.colab.kernel.invokeFunction(\n",
  901. " 'suggestCharts', [key], {});\n",
  902. " } catch (error) {\n",
  903. " console.error('Error during call to suggestCharts:', error);\n",
  904. " }\n",
  905. " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
  906. " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
  907. " }\n",
  908. " (() => {\n",
  909. " let quickchartButtonEl =\n",
  910. " document.querySelector('#df-f541734f-766a-44ba-88c6-1b4f4c4c57b7 button');\n",
  911. " quickchartButtonEl.style.display =\n",
  912. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  913. " })();\n",
  914. " </script>\n",
  915. " </div>\n",
  916. " </div>\n",
  917. " </div>\n"
  918. ],
  919. "application/vnd.google.colaboratory.intrinsic+json": {
  920. "type": "dataframe",
  921. "variable_name": "sentence_bench",
  922. "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
  923. }
  924. },
  925. "metadata": {},
  926. "execution_count": 8
  927. }
  928. ]
  929. },
  930. {
  931. "cell_type": "markdown",
  932. "metadata": {
  933. "id": "wDV7ysXf2b_H"
  934. },
  935. "source": [
  936. "### Get ManaTTS"
  937. ]
  938. },
  939. {
  940. "cell_type": "code",
  941. "execution_count": null,
  942. "metadata": {
  943. "colab": {
  944. "base_uri": "https://localhost:8080/"
  945. },
  946. "id": "TcL5ZLvSSnVB",
  947. "outputId": "57eed88d-7ae2-4c1d-fd40-4f8f09e8dd03"
  948. },
  949. "outputs": [
  950. {
  951. "output_type": "execute_result",
  952. "data": {
  953. "text/plain": [
  954. "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
  955. " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
  956. " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
  957. " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
  958. " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
  959. " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
  960. ]
  961. },
  962. "metadata": {},
  963. "execution_count": 9
  964. }
  965. ],
  966. "source": [
  967. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
  968. "\n",
  969. "# Convert to a list of tuples\n",
  970. "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  971. "\n",
  972. "mana_evaluation_data[:3]"
  973. ]
  974. },
  975. {
  976. "cell_type": "markdown",
  977. "metadata": {
  978. "id": "Jjacw9Mp2eoX"
  979. },
  980. "source": [
  981. "### Get CommonVoice"
  982. ]
  983. },
  984. {
  985. "cell_type": "code",
  986. "execution_count": null,
  987. "metadata": {
  988. "id": "-yQnqCGw26sk",
  989. "colab": {
  990. "base_uri": "https://localhost:8080/"
  991. },
  992. "outputId": "60f4e58c-8529-4d62-88ca-795413c40385"
  993. },
  994. "outputs": [
  995. {
  996. "output_type": "execute_result",
  997. "data": {
  998. "text/plain": [
  999. "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
  1000. " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
  1001. " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
  1002. " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
  1003. " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
  1004. ]
  1005. },
  1006. "metadata": {},
  1007. "execution_count": 10
  1008. }
  1009. ],
  1010. "source": [
  1011. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
  1012. "\n",
  1013. "# Convert to a list of tuples\n",
  1014. "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  1015. "\n",
  1016. "commonvoice_evaluation_data[:3]"
  1017. ]
  1018. },
  1019. {
  1020. "cell_type": "markdown",
  1021. "metadata": {
  1022. "id": "ciSPyhRc3Rvo"
  1023. },
  1024. "source": [
  1025. "### Get Homograph"
  1026. ]
  1027. },
  1028. {
  1029. "cell_type": "code",
  1030. "execution_count": null,
  1031. "metadata": {
  1032. "id": "XlFc5JbN3Rvz",
  1033. "colab": {
  1034. "base_uri": "https://localhost:8080/"
  1035. },
  1036. "outputId": "19aeaf85-6001-4d7f-e46f-e65b05b0a861"
  1037. },
  1038. "outputs": [
  1039. {
  1040. "output_type": "execute_result",
  1041. "data": {
  1042. "text/plain": [
  1043. "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
  1044. " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
  1045. " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
  1046. " 'قدر',\n",
  1047. " 'qadar'),\n",
  1048. " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
  1049. ]
  1050. },
  1051. "metadata": {},
  1052. "execution_count": 11
  1053. }
  1054. ],
  1055. "source": [
  1056. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
  1057. "\n",
  1058. "# Convert to a list of tuples\n",
  1059. "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  1060. "\n",
  1061. "homograph_evaluation_data[:3]"
  1062. ]
  1063. },
  1064. {
  1065. "cell_type": "markdown",
  1066. "metadata": {
  1067. "id": "R6PE5ds45TPr"
  1068. },
  1069. "source": [
  1070. "# Evaluate Method Outputs"
  1071. ]
  1072. },
  1073. {
  1074. "cell_type": "markdown",
  1075. "metadata": {
  1076. "id": "CLKaERek4u_D"
  1077. },
  1078. "source": [
  1079. "## PER Evaluation"
  1080. ]
  1081. },
  1082. {
  1083. "cell_type": "code",
  1084. "execution_count": null,
  1085. "metadata": {
  1086. "id": "nBee9xG54u_E"
  1087. },
  1088. "outputs": [],
  1089. "source": [
  1090. "def remove_non_word_chars(text):\n",
  1091. " pattern = r'[^\\w\\s\\?]'\n",
  1092. " cleaned_text = re.sub(pattern, ' ', text)\n",
  1093. " return cleaned_text"
  1094. ]
  1095. },
  1096. {
  1097. "cell_type": "code",
  1098. "execution_count": null,
  1099. "metadata": {
  1100. "id": "W8PoNV9V4u_E"
  1101. },
  1102. "outputs": [],
  1103. "source": [
  1104. "def remove_white_spaces(text):\n",
  1105. " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
  1106. " return cleaned_text.strip()"
  1107. ]
  1108. },
  1109. {
  1110. "cell_type": "code",
  1111. "execution_count": null,
  1112. "metadata": {
  1113. "id": "YD0cvnn74u_E"
  1114. },
  1115. "outputs": [],
  1116. "source": [
  1117. "def get_word_only_text(text):\n",
  1118. " word_only_text = remove_non_word_chars(text)\n",
  1119. " extra_space_removed_text = remove_white_spaces(word_only_text)\n",
  1120. "\n",
  1121. " return extra_space_removed_text"
  1122. ]
  1123. },
  1124. {
  1125. "cell_type": "code",
  1126. "execution_count": null,
  1127. "metadata": {
  1128. "id": "6OQQDual4u_E"
  1129. },
  1130. "outputs": [],
  1131. "source": [
  1132. "def get_texts_cer(reference, model_output):\n",
  1133. " # Preprocess input texts to only contain word characters\n",
  1134. " word_only_reference = get_word_only_text(reference)\n",
  1135. " word_only_output = get_word_only_text(model_output)\n",
  1136. "\n",
  1137. " # Return +infinity for CER if any of the texts is empty\n",
  1138. " if not word_only_reference.strip() or not word_only_output.strip():\n",
  1139. " return float('inf')\n",
  1140. "\n",
  1141. " return cer(word_only_reference, word_only_output)"
  1142. ]
  1143. },
  1144. {
  1145. "cell_type": "code",
  1146. "execution_count": null,
  1147. "metadata": {
  1148. "id": "ncWQnPdW4u_E"
  1149. },
  1150. "outputs": [],
  1151. "source": [
  1152. "def get_avg_cer_of_method(method_outputs, references):\n",
  1153. " cers = []\n",
  1154. " for idx, o in enumerate(method_outputs):\n",
  1155. " cer = get_texts_cer(references[idx][1].replace('-', ''), o.replace('-', ''))\n",
  1156. " if cer != float('inf'):\n",
  1157. " cers.append(cer)\n",
  1158. "\n",
  1159. " return sum(cers) / len(cers)"
  1160. ]
  1161. },
  1162. {
  1163. "cell_type": "markdown",
  1164. "source": [
  1165. "## Homograph Evaluation"
  1166. ],
  1167. "metadata": {
  1168. "id": "oBgNtpFQDwku"
  1169. }
  1170. },
  1171. {
  1172. "cell_type": "code",
  1173. "source": [
  1174. "def get_homograph_performance(outputs, references):\n",
  1175. " corrects = 0\n",
  1176. " total = 0\n",
  1177. "\n",
  1178. " for idx, (g, p, homograph, right) in enumerate(references):\n",
  1179. " if homograph != '':\n",
  1180. " total += 1\n",
  1181. " if right in outputs[idx]:\n",
  1182. " corrects += 1\n",
  1183. "\n",
  1184. " return corrects / total"
  1185. ],
  1186. "metadata": {
  1187. "id": "J445ULEvEEDn"
  1188. },
  1189. "execution_count": null,
  1190. "outputs": []
  1191. },
  1192. {
  1193. "cell_type": "markdown",
  1194. "source": [
"# Full Benchmark"
  1196. ],
  1197. "metadata": {
  1198. "id": "JGEUIrbi9kNH"
  1199. }
  1200. },
  1201. {
  1202. "cell_type": "code",
  1203. "source": [
  1204. "benchmark = []\n",
  1205. "\n",
  1206. "for g, p in mana_evaluation_data:\n",
  1207. " benchmark.append((g, p, '', ''))\n",
  1208. "\n",
  1209. "for g, p in commonvoice_evaluation_data:\n",
  1210. " benchmark.append((g, p, '', ''))\n",
  1211. "\n",
  1212. "for g, p, w, r in homograph_evaluation_data:\n",
  1213. " benchmark.append((g, p, w, r))\n",
  1214. "\n",
  1215. "benchmark = benchmark[:400]"
  1216. ],
  1217. "metadata": {
  1218. "id": "fGzQvL8V9mln"
  1219. },
  1220. "execution_count": null,
  1221. "outputs": []
  1222. },
  1223. {
  1224. "cell_type": "code",
  1225. "source": [
  1226. "def print_all_metrics(predictions):\n",
  1227. " per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
  1228. " homograph = get_homograph_performance(predictions, benchmark) * 100\n",
  1229. "\n",
  1230. " print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
  1231. " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
  1232. ],
  1233. "metadata": {
  1234. "id": "DpSqE5oPbmAy"
  1235. },
  1236. "execution_count": null,
  1237. "outputs": []
  1238. },
  1239. {
  1240. "cell_type": "markdown",
  1241. "source": [
  1242. "# Sentence Inference"
  1243. ],
  1244. "metadata": {
  1245. "id": "_-mKfiwYIMdQ"
  1246. }
  1247. },
  1248. {
  1249. "cell_type": "code",
  1250. "source": [
  1251. "from hazm import WordTokenizer, Normalizer\n",
  1252. "tokenizer = WordTokenizer()\n",
  1253. "normalizer = Normalizer()"
  1254. ],
  1255. "metadata": {
  1256. "id": "PNuCMdIuVPf5"
  1257. },
  1258. "execution_count": null,
  1259. "outputs": []
  1260. },
  1261. {
  1262. "cell_type": "code",
  1263. "source": [
  1264. "def sentence_inference(sent):\n",
  1265. " phonemes = []\n",
  1266. " tokens = tokenizer.tokenize(normalizer.normalize(sent))\n",
  1267. " i = 0\n",
  1268. " while i < len(tokens):\n",
  1269. " subsent = ' '.join(tokens[i:i+3])\n",
  1270. " try:\n",
  1271. " subphoneme = phonemizer.phonemize(subsent)\n",
  1272. " except:\n",
  1273. " subphoneme = ''\n",
  1274. " phonemes.append(subphoneme)\n",
  1275. " i += len(tokens[i:i+3])\n",
  1276. "\n",
  1277. " phoneme = ' '.join(phonemes)\n",
  1278. " return phoneme"
  1279. ],
  1280. "metadata": {
  1281. "id": "rTPVMXEGU_9r"
  1282. },
  1283. "execution_count": null,
  1284. "outputs": []
  1285. },
  1286. {
  1287. "cell_type": "markdown",
  1288. "source": [
"# Outputs"
  1290. ],
  1291. "metadata": {
  1292. "id": "fRaAhTMsMHBJ"
  1293. }
  1294. },
  1295. {
  1296. "cell_type": "code",
  1297. "source": [
  1298. "import time\n",
  1299. "from tqdm import tqdm\n",
  1300. "\n",
  1301. "# Initialize variables\n",
  1302. "outputs = []\n",
  1303. "start_time = time.time() # Start total timer\n",
  1304. "\n",
  1305. "# Process each sample with progress bar\n",
  1306. "for g, p, _, _ in tqdm(benchmark, desc=\"Running inference\"):\n",
  1307. " o = sentence_inference(g)\n",
  1308. " outputs.append(o)\n",
  1309. "\n",
  1310. "# Calculate timing stats\n",
  1311. "total_time = time.time() - start_time\n",
  1312. "avg_time = total_time / len(benchmark) if benchmark else 0"
  1313. ],
  1314. "metadata": {
  1315. "colab": {
  1316. "base_uri": "https://localhost:8080/"
  1317. },
  1318. "outputId": "acc51819-0c98-4eda-a4cc-270f9707e370",
  1319. "id": "PYn9z4GiMHBf"
  1320. },
  1321. "execution_count": null,
  1322. "outputs": [
  1323. {
  1324. "output_type": "stream",
  1325. "name": "stderr",
  1326. "text": [
  1327. "Running inference: 100%|██████████| 400/400 [01:15<00:00, 5.33it/s]\n"
  1328. ]
  1329. }
  1330. ]
  1331. },
  1332. {
  1333. "cell_type": "code",
  1334. "source": [
  1335. "mapped_outputs = []\n",
  1336. "for o in outputs:\n",
  1337. " mapped = replace_phonetic_characters(o)\n",
  1338. " mapped_outputs.append(mapped)"
  1339. ],
  1340. "metadata": {
  1341. "id": "90gAxDT-GMhI"
  1342. },
  1343. "execution_count": null,
  1344. "outputs": []
  1345. },
  1346. {
  1347. "cell_type": "code",
  1348. "source": [
  1349. "print_all_metrics(mapped_outputs)\n",
  1350. "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
  1351. "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)\")"
  1352. ],
  1353. "metadata": {
  1354. "id": "zP4Tcj285Ij0",
  1355. "colab": {
  1356. "base_uri": "https://localhost:8080/"
  1357. },
  1358. "outputId": "5986173e-4d6b-4ff6-b18c-b1e72e3afd83"
  1359. },
  1360. "execution_count": null,
  1361. "outputs": [
  1362. {
  1363. "output_type": "stream",
  1364. "name": "stdout",
  1365. "text": [
  1366. "PER: \t\t\t25.2989\n",
  1367. "HOMOGRAPH: \t\t29.2453\n",
  1368. "TOTAL TIME:\t\t75.0611 (s)\n",
  1369. "AVG TIME:\t\t0.1877 (s)\n"
  1370. ]
  1371. }
  1372. ]
  1373. },
  1374. {
  1375. "cell_type": "markdown",
  1376. "source": [
  1377. "# Runs\n",
  1378. "\n",
"## First\n",
  1380. "\n",
  1381. "```\n",
  1382. "PER: \t\t\t25.2111\n",
  1383. "HOMOGRAPH: \t\t29.7170\n",
  1384. "TOTAL TIME:\t\t85.6780 (s)\n",
  1385. "AVG TIME:\t\t0.2142 (s)\n",
  1386. "```\n",
  1387. "\n",
  1388. "## Second\n",
  1389. "\n",
  1390. "```\n",
  1391. "PER: \t\t\t25.1991\n",
  1392. "HOMOGRAPH: \t\t29.7170\n",
  1393. "TOTAL TIME:\t\t41.5983 (s)\n",
  1394. "AVG TIME:\t\t0.1040 (s)\n",
  1395. "```\n",
  1396. "\n",
  1397. "## Third\n",
  1398. "\n",
  1399. "```\n",
  1400. "PER: \t\t\t25.2278\n",
  1401. "HOMOGRAPH: \t\t28.7736\n",
  1402. "TOTAL TIME:\t\t78.4948 (s)\n",
  1403. "AVG TIME:\t\t0.1962 (s)\n",
  1404. "```\n",
  1405. "\n",
  1406. "\n",
  1407. "## Fourth\n",
  1408. "\n",
  1409. "```\n",
  1410. "PER: \t\t\t25.4065\n",
  1411. "HOMOGRAPH: \t\t28.7736\n",
  1412. "TOTAL TIME:\t\t79.8478 (s)\n",
  1413. "AVG TIME:\t\t0.1996 (s)\n",
  1414. "```\n",
  1415. "\n",
  1416. "## Fifth\n",
  1417. "\n",
  1418. "```\n",
  1419. "PER: \t\t\t25.2989\n",
  1420. "HOMOGRAPH: \t\t29.2453\n",
  1421. "TOTAL TIME:\t\t75.0611 (s)\n",
  1422. "AVG TIME:\t\t0.1877 (s)\n",
  1423. "```"
  1424. ],
  1425. "metadata": {
  1426. "id": "5YPY-B2DjHW7"
  1427. }
  1428. }
  1429. ]
  1430. }