Benchmarking notebooks for various Persian G2P models, including Homo-GE2PE and Homo-T5, comparing their performance on the SentenceBench dataset.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Benchmark_Homo_GE2PE.ipynb 51KB


  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {
  6. "id": "9sEfZoepGP8x"
  7. },
  8. "source": [
  9. "# Setup Environment"
  10. ]
  11. },
  12. {
  13. "cell_type": "code",
  14. "execution_count": null,
  15. "metadata": {
  16. "colab": {
  17. "base_uri": "https://localhost:8080/",
  18. "height": 1000
  19. },
  20. "id": "u6n8Hc1hQSy7",
  21. "outputId": "4b73a4e7-90d9-4be8-d800-7ee57c5d0d6b"
  22. },
  23. "outputs": [
  24. {
  25. "output_type": "stream",
  26. "name": "stdout",
  27. "text": [
  28. "Collecting hazm==0.10.0\n",
  29. " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
  30. "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n",
  31. " Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
  32. "Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n",
  33. " Downloading flashtext-2.7.tar.gz (14 kB)\n",
  34. " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  35. "Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n",
  36. " Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
  37. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
  38. "Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
  39. " Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
  40. "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n",
  41. " Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
  42. "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
  43. "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n",
  44. " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
  45. "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
  46. "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n",
  47. " Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
  48. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  49. "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
  50. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
  51. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.5.0)\n",
  52. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
  53. "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
  54. "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
  55. "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
  56. "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
  57. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  58. "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
  59. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  60. "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
  61. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  62. "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
  63. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  64. "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
  65. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  66. "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
  67. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  68. "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
  69. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  70. "\u001b[?25hBuilding wheels for collected packages: flashtext\n",
  71. " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  72. " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=78287f302c5e4a8386f41636f5409c9340612d4ade63fc8a79516d6eb9621d2f\n",
  73. " Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
  74. "Successfully built flashtext\n",
  75. "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n",
  76. " Attempting uninstall: numpy\n",
  77. " Found existing installation: numpy 2.0.2\n",
  78. " Uninstalling numpy-2.0.2:\n",
  79. " Successfully uninstalled numpy-2.0.2\n",
  80. " Attempting uninstall: scipy\n",
  81. " Found existing installation: scipy 1.15.2\n",
  82. " Uninstalling scipy-1.15.2:\n",
  83. " Successfully uninstalled scipy-1.15.2\n",
  84. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  85. "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
  86. "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
  87. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
  88. "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
  89. "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  90. "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  91. "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
  92. "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
  93. "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
  94. "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
  95. "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
  96. ]
  97. },
  98. {
  99. "output_type": "display_data",
  100. "data": {
  101. "application/vnd.colab-display-data+json": {
  102. "pip_warning": {
  103. "packages": [
  104. "numpy"
  105. ]
  106. },
  107. "id": "be4e054b011d4e0e930386438573d963"
  108. }
  109. },
  110. "metadata": {}
  111. }
  112. ],
  113. "source": [
  114. "! pip install hazm==0.10.0"
  115. ]
  116. },
  117. {
  118. "cell_type": "code",
  119. "execution_count": null,
  120. "metadata": {
  121. "colab": {
  122. "base_uri": "https://localhost:8080/"
  123. },
  124. "id": "iA2Jjex-KMqx",
  125. "outputId": "b69e9e85-54f8-49ea-e50c-8f62aca42a65"
  126. },
  127. "outputs": [
  128. {
  129. "output_type": "stream",
  130. "name": "stdout",
  131. "text": [
  132. "Collecting numpy==1.26.0\n",
  133. " Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n",
  134. "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  135. "\u001b[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
  136. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m89.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  137. "\u001b[?25hInstalling collected packages: numpy\n",
  138. " Attempting uninstall: numpy\n",
  139. " Found existing installation: numpy 1.24.3\n",
  140. " Uninstalling numpy-1.24.3:\n",
  141. " Successfully uninstalled numpy-1.24.3\n",
  142. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  143. "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n",
  144. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\n",
  145. "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\u001b[0m\u001b[31m\n",
  146. "\u001b[0mSuccessfully installed numpy-1.26.0\n"
  147. ]
  148. }
  149. ],
  150. "source": [
  151. "!pip install numpy==1.26.0"
  152. ]
  153. },
  154. {
  155. "cell_type": "code",
  156. "execution_count": null,
  157. "metadata": {
  158. "colab": {
  159. "base_uri": "https://localhost:8080/",
  160. "height": 54
  161. },
  162. "id": "QP854R4YHf4I",
  163. "outputId": "5f5efc6a-795d-4b7d-a9ae-6d079a614fac"
  164. },
  165. "outputs": [
  166. {
  167. "output_type": "display_data",
  168. "data": {
  169. "text/plain": [
  170. "<IPython.core.display.HTML object>"
  171. ],
  172. "text/html": [
  173. "\n",
  174. "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
  175. " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
  176. "</div>\n"
  177. ]
  178. },
  179. "metadata": {}
  180. }
  181. ],
  182. "source": [
  183. "from IPython.display import display, HTML\n",
  184. "\n",
  185. "display(HTML(\"\"\"\n",
  186. "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
  187. " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
  188. "</div>\n",
  189. "\"\"\"))"
  190. ]
  191. },
  192. {
  193. "cell_type": "code",
  194. "execution_count": null,
  195. "metadata": {
  196. "id": "EVO9pn8Ou3o1"
  197. },
  198. "outputs": [],
  199. "source": [
  200. "!pip install -q --upgrade --no-cache-dir gdown"
  201. ]
  202. },
  203. {
  204. "cell_type": "code",
  205. "execution_count": null,
  206. "metadata": {
  207. "colab": {
  208. "base_uri": "https://localhost:8080/"
  209. },
  210. "id": "brKU69ZQvEiz",
  211. "outputId": "c6f840f0-01f7-4c1a-cf76-dbc5149f601c"
  212. },
  213. "outputs": [
  214. {
  215. "output_type": "stream",
  216. "name": "stdout",
  217. "text": [
  218. "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/235.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m235.5/235.8 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  219. "\u001b[?25h"
  220. ]
  221. }
  222. ],
  223. "source": [
  224. "!pip install -q unidecode\n",
  225. "!pip install -q transformers"
  226. ]
  227. },
  228. {
  229. "cell_type": "code",
  230. "execution_count": null,
  231. "metadata": {
  232. "colab": {
  233. "base_uri": "https://localhost:8080/"
  234. },
  235. "id": "grp-l-cbGNWm",
  236. "outputId": "fbb8d2e0-8a23-4493-e455-474241c45d6a"
  237. },
  238. "outputs": [
  239. {
  240. "output_type": "stream",
  241. "name": "stdout",
  242. "text": [
  243. "Collecting jiwer\n",
  244. " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
  245. "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
  246. "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
  247. " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
  248. "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
  249. "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
  250. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  251. "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
  252. "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
  253. ]
  254. }
  255. ],
  256. "source": [
  257. "!pip install jiwer"
  258. ]
  259. },
  260. {
  261. "cell_type": "code",
  262. "execution_count": null,
  263. "metadata": {
  264. "id": "dQ0osefGGSpJ"
  265. },
  266. "outputs": [],
  267. "source": [
  268. "import pandas as pd\n",
  269. "import re\n",
  270. "from jiwer import cer"
  271. ]
  272. },
  273. {
  274. "cell_type": "markdown",
  275. "metadata": {
  276. "id": "Nwt1YBYVqcva"
  277. },
  278. "source": [
  279. "# Setup Model"
  280. ]
  281. },
  282. {
  283. "cell_type": "code",
  284. "execution_count": null,
  285. "metadata": {
  286. "id": "x-kHFEm8u8Xg"
  287. },
  288. "outputs": [],
  289. "source": [
  290. "!gdown -q 1Or8xx3KX-ZNqt0Ag_FUA2q8TWdxvY6Kr # The Checkpoint\n",
  291. "!unzip -q ge2pe-chpt.zip\n",
  292. "!rm ./ge2pe-chpt.zip"
  293. ]
  294. },
  295. {
  296. "cell_type": "code",
  297. "execution_count": null,
  298. "metadata": {
  299. "id": "CGVVxGpivULm"
  300. },
  301. "outputs": [],
  302. "source": [
  303. "!gdown -q 11Yb0QjyP2R3RvN1oSCX9m0DL2_bzDeZS # Parsivar for normalization\n",
  304. "!unzip -q ./Parsivar.zip\n",
  305. "!rm ./Parsivar.zip"
  306. ]
  307. },
  308. {
  309. "cell_type": "code",
  310. "execution_count": null,
  311. "metadata": {
  312. "colab": {
  313. "base_uri": "https://localhost:8080/"
  314. },
  315. "id": "K-mPQF5ykcmF",
  316. "outputId": "8368d1b8-65b0-47f5-9727-8dd1f1da134b"
  317. },
  318. "outputs": [
  319. {
  320. "output_type": "stream",
  321. "name": "stdout",
  322. "text": [
  323. "Downloading...\n",
  324. "From (original): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34\n",
  325. "From (redirected): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34&confirm=t&uuid=d2dbee7b-e12b-460e-aef6-af169e136cdd\n",
  326. "To: /content/GE2PE.py\n",
  327. "\r 0% 0.00/4.96k [00:00<?, ?B/s]\r100% 4.96k/4.96k [00:00<00:00, 12.8MB/s]\n"
  328. ]
  329. }
  330. ],
  331. "source": [
  332. "! gdown 1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34 # GE2PE.py"
  333. ]
  334. },
  335. {
  336. "cell_type": "code",
  337. "execution_count": null,
  338. "metadata": {
  339. "id": "VIRvJy8naB0f"
  340. },
  341. "outputs": [],
  342. "source": [
  343. "!sed -i 's+from collections import Iterable+from collections.abc import Iterable+g' /content/Parsivar/token_merger.py"
  344. ]
  345. },
  346. {
  347. "cell_type": "code",
  348. "execution_count": null,
  349. "metadata": {
  350. "colab": {
  351. "base_uri": "https://localhost:8080/"
  352. },
  353. "id": "Qs-J5B3ykaYz",
  354. "outputId": "eeb0c2ee-539f-44ee-c469-7eed7edca22b"
  355. },
  356. "outputs": [
  357. {
  358. "output_type": "execute_result",
  359. "data": {
  360. "text/plain": [
  361. "['teste model t/bdil nevise be vaj', '@in ketabe @ali @/st']"
  362. ]
  363. },
  364. "metadata": {},
  365. "execution_count": 9
  366. }
  367. ],
  368. "source": [
  369. "from GE2PE import GE2PE\n",
  370. "\n",
  371. "g2p = GE2PE(model_path='/content/ge2pe-chpt')\n",
  372. "\n",
  373. "g2p.generate(['تست مدل تبدیل نویسه به واج', 'این کتابِ علی است'], use_rules=True)"
  374. ]
  375. },
  376. {
  377. "cell_type": "markdown",
  378. "metadata": {
  379. "id": "XjAPkfq7SF87"
  380. },
  381. "source": [
  382. "# Get Evaluation Data"
  383. ]
  384. },
  385. {
  386. "cell_type": "code",
  387. "execution_count": null,
  388. "metadata": {
  389. "colab": {
  390. "base_uri": "https://localhost:8080/"
  391. },
  392. "id": "qwCG0jX-88nQ",
  393. "outputId": "ffe13c8d-3b96-479f-bf8d-04f031ca412d"
  394. },
  395. "outputs": [
  396. {
  397. "output_type": "stream",
  398. "name": "stdout",
  399. "text": [
  400. "--2025-05-13 06:47:23-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
  401. "Resolving huggingface.co (huggingface.co)... 18.164.174.23, 18.164.174.17, 18.164.174.55, ...\n",
  402. "Connecting to huggingface.co (huggingface.co)|18.164.174.23|:443... connected.\n",
  403. "HTTP request sent, awaiting response... 200 OK\n",
  404. "Length: 56026 (55K) [text/plain]\n",
  405. "Saving to: ‘SentenceBench.csv’\n",
  406. "\n",
  407. "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.008s \n",
  408. "\n",
  409. "2025-05-13 06:47:23 (6.99 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
  410. "\n"
  411. ]
  412. }
  413. ],
  414. "source": [
  415. "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
  416. ]
  417. },
  418. {
  419. "cell_type": "code",
  420. "execution_count": null,
  421. "metadata": {
  422. "id": "hJO-UAPDQvcb"
  423. },
  424. "outputs": [],
  425. "source": [
  426. "sentence_bench = pd.read_csv('SentenceBench.csv')"
  427. ]
  428. },
  429. {
  430. "cell_type": "code",
  431. "execution_count": null,
  432. "metadata": {
  433. "colab": {
  434. "base_uri": "https://localhost:8080/",
  435. "height": 143
  436. },
  437. "id": "qlYbrnUa9LAN",
  438. "outputId": "d95b98b2-db8a-4192-a3e3-c8f154a17792"
  439. },
  440. "outputs": [
  441. {
  442. "output_type": "execute_result",
  443. "data": {
  444. "text/plain": [
  445. " dataset grapheme \\\n",
  446. "0 homograph من قدر تو را می‌دانم \n",
  447. "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
  448. "2 homograph به دست و صورتم کرم زدم \n",
  449. "\n",
  450. " phoneme homograph word \\\n",
  451. "0 man qadr-e to rA mi-dAnam قدر \n",
  452. "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
  453. "2 be dast-o suratam kerem zadam کرم \n",
  454. "\n",
  455. " pronunciation \n",
  456. "0 qadr \n",
  457. "1 qadar \n",
  458. "2 kerem "
  459. ],
  460. "text/html": [
  461. "\n",
  462. " <div id=\"df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f\" class=\"colab-df-container\">\n",
  463. " <div>\n",
  464. "<style scoped>\n",
  465. " .dataframe tbody tr th:only-of-type {\n",
  466. " vertical-align: middle;\n",
  467. " }\n",
  468. "\n",
  469. " .dataframe tbody tr th {\n",
  470. " vertical-align: top;\n",
  471. " }\n",
  472. "\n",
  473. " .dataframe thead th {\n",
  474. " text-align: right;\n",
  475. " }\n",
  476. "</style>\n",
  477. "<table border=\"1\" class=\"dataframe\">\n",
  478. " <thead>\n",
  479. " <tr style=\"text-align: right;\">\n",
  480. " <th></th>\n",
  481. " <th>dataset</th>\n",
  482. " <th>grapheme</th>\n",
  483. " <th>phoneme</th>\n",
  484. " <th>homograph word</th>\n",
  485. " <th>pronunciation</th>\n",
  486. " </tr>\n",
  487. " </thead>\n",
  488. " <tbody>\n",
  489. " <tr>\n",
  490. " <th>0</th>\n",
  491. " <td>homograph</td>\n",
  492. " <td>من قدر تو را می‌دانم</td>\n",
  493. " <td>man qadr-e to rA mi-dAnam</td>\n",
  494. " <td>قدر</td>\n",
  495. " <td>qadr</td>\n",
  496. " </tr>\n",
  497. " <tr>\n",
  498. " <th>1</th>\n",
  499. " <td>homograph</td>\n",
  500. " <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
  501. " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
  502. " <td>قدر</td>\n",
  503. " <td>qadar</td>\n",
  504. " </tr>\n",
  505. " <tr>\n",
  506. " <th>2</th>\n",
  507. " <td>homograph</td>\n",
  508. " <td>به دست و صورتم کرم زدم</td>\n",
  509. " <td>be dast-o suratam kerem zadam</td>\n",
  510. " <td>کرم</td>\n",
  511. " <td>kerem</td>\n",
  512. " </tr>\n",
  513. " </tbody>\n",
  514. "</table>\n",
  515. "</div>\n",
  516. " <div class=\"colab-df-buttons\">\n",
  517. "\n",
  518. " <div class=\"colab-df-container\">\n",
  519. " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f')\"\n",
  520. " title=\"Convert this dataframe to an interactive table.\"\n",
  521. " style=\"display:none;\">\n",
  522. "\n",
  523. " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
  524. " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
  525. " </svg>\n",
  526. " </button>\n",
  527. "\n",
  528. " <style>\n",
  529. " .colab-df-container {\n",
  530. " display:flex;\n",
  531. " gap: 12px;\n",
  532. " }\n",
  533. "\n",
  534. " .colab-df-convert {\n",
  535. " background-color: #E8F0FE;\n",
  536. " border: none;\n",
  537. " border-radius: 50%;\n",
  538. " cursor: pointer;\n",
  539. " display: none;\n",
  540. " fill: #1967D2;\n",
  541. " height: 32px;\n",
  542. " padding: 0 0 0 0;\n",
  543. " width: 32px;\n",
  544. " }\n",
  545. "\n",
  546. " .colab-df-convert:hover {\n",
  547. " background-color: #E2EBFA;\n",
  548. " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  549. " fill: #174EA6;\n",
  550. " }\n",
  551. "\n",
  552. " .colab-df-buttons div {\n",
  553. " margin-bottom: 4px;\n",
  554. " }\n",
  555. "\n",
  556. " [theme=dark] .colab-df-convert {\n",
  557. " background-color: #3B4455;\n",
  558. " fill: #D2E3FC;\n",
  559. " }\n",
  560. "\n",
  561. " [theme=dark] .colab-df-convert:hover {\n",
  562. " background-color: #434B5C;\n",
  563. " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
  564. " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
  565. " fill: #FFFFFF;\n",
  566. " }\n",
  567. " </style>\n",
  568. "\n",
  569. " <script>\n",
  570. " const buttonEl =\n",
  571. " document.querySelector('#df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f button.colab-df-convert');\n",
  572. " buttonEl.style.display =\n",
  573. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  574. "\n",
  575. " async function convertToInteractive(key) {\n",
  576. " const element = document.querySelector('#df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f');\n",
  577. " const dataTable =\n",
  578. " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
  579. " [key], {});\n",
  580. " if (!dataTable) return;\n",
  581. "\n",
  582. " const docLinkHtml = 'Like what you see? Visit the ' +\n",
  583. " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
  584. " + ' to learn more about interactive tables.';\n",
  585. " element.innerHTML = '';\n",
  586. " dataTable['output_type'] = 'display_data';\n",
  587. " await google.colab.output.renderOutput(dataTable, element);\n",
  588. " const docLink = document.createElement('div');\n",
  589. " docLink.innerHTML = docLinkHtml;\n",
  590. " element.appendChild(docLink);\n",
  591. " }\n",
  592. " </script>\n",
  593. " </div>\n",
  594. "\n",
  595. "\n",
  596. " <div id=\"df-dea3061f-6364-43a5-8c24-98a030eec895\">\n",
  597. " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-dea3061f-6364-43a5-8c24-98a030eec895')\"\n",
  598. " title=\"Suggest charts\"\n",
  599. " style=\"display:none;\">\n",
  600. "\n",
  601. "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
  602. " width=\"24px\">\n",
  603. " <g>\n",
  604. " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
  605. " </g>\n",
  606. "</svg>\n",
  607. " </button>\n",
  608. "\n",
  609. "<style>\n",
  610. " .colab-df-quickchart {\n",
  611. " --bg-color: #E8F0FE;\n",
  612. " --fill-color: #1967D2;\n",
  613. " --hover-bg-color: #E2EBFA;\n",
  614. " --hover-fill-color: #174EA6;\n",
  615. " --disabled-fill-color: #AAA;\n",
  616. " --disabled-bg-color: #DDD;\n",
  617. " }\n",
  618. "\n",
  619. " [theme=dark] .colab-df-quickchart {\n",
  620. " --bg-color: #3B4455;\n",
  621. " --fill-color: #D2E3FC;\n",
  622. " --hover-bg-color: #434B5C;\n",
  623. " --hover-fill-color: #FFFFFF;\n",
  624. " --disabled-bg-color: #3B4455;\n",
  625. " --disabled-fill-color: #666;\n",
  626. " }\n",
  627. "\n",
  628. " .colab-df-quickchart {\n",
  629. " background-color: var(--bg-color);\n",
  630. " border: none;\n",
  631. " border-radius: 50%;\n",
  632. " cursor: pointer;\n",
  633. " display: none;\n",
  634. " fill: var(--fill-color);\n",
  635. " height: 32px;\n",
  636. " padding: 0;\n",
  637. " width: 32px;\n",
  638. " }\n",
  639. "\n",
  640. " .colab-df-quickchart:hover {\n",
  641. " background-color: var(--hover-bg-color);\n",
  642. " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  643. " fill: var(--button-hover-fill-color);\n",
  644. " }\n",
  645. "\n",
  646. " .colab-df-quickchart-complete:disabled,\n",
  647. " .colab-df-quickchart-complete:disabled:hover {\n",
  648. " background-color: var(--disabled-bg-color);\n",
  649. " fill: var(--disabled-fill-color);\n",
  650. " box-shadow: none;\n",
  651. " }\n",
  652. "\n",
  653. " .colab-df-spinner {\n",
  654. " border: 2px solid var(--fill-color);\n",
  655. " border-color: transparent;\n",
  656. " border-bottom-color: var(--fill-color);\n",
  657. " animation:\n",
  658. " spin 1s steps(1) infinite;\n",
  659. " }\n",
  660. "\n",
  661. " @keyframes spin {\n",
  662. " 0% {\n",
  663. " border-color: transparent;\n",
  664. " border-bottom-color: var(--fill-color);\n",
  665. " border-left-color: var(--fill-color);\n",
  666. " }\n",
  667. " 20% {\n",
  668. " border-color: transparent;\n",
  669. " border-left-color: var(--fill-color);\n",
  670. " border-top-color: var(--fill-color);\n",
  671. " }\n",
  672. " 30% {\n",
  673. " border-color: transparent;\n",
  674. " border-left-color: var(--fill-color);\n",
  675. " border-top-color: var(--fill-color);\n",
  676. " border-right-color: var(--fill-color);\n",
  677. " }\n",
  678. " 40% {\n",
  679. " border-color: transparent;\n",
  680. " border-right-color: var(--fill-color);\n",
  681. " border-top-color: var(--fill-color);\n",
  682. " }\n",
  683. " 60% {\n",
  684. " border-color: transparent;\n",
  685. " border-right-color: var(--fill-color);\n",
  686. " }\n",
  687. " 80% {\n",
  688. " border-color: transparent;\n",
  689. " border-right-color: var(--fill-color);\n",
  690. " border-bottom-color: var(--fill-color);\n",
  691. " }\n",
  692. " 90% {\n",
  693. " border-color: transparent;\n",
  694. " border-bottom-color: var(--fill-color);\n",
  695. " }\n",
  696. " }\n",
  697. "</style>\n",
  698. "\n",
  699. " <script>\n",
  700. " async function quickchart(key) {\n",
  701. " const quickchartButtonEl =\n",
  702. " document.querySelector('#' + key + ' button');\n",
  703. " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
  704. " quickchartButtonEl.classList.add('colab-df-spinner');\n",
  705. " try {\n",
  706. " const charts = await google.colab.kernel.invokeFunction(\n",
  707. " 'suggestCharts', [key], {});\n",
  708. " } catch (error) {\n",
  709. " console.error('Error during call to suggestCharts:', error);\n",
  710. " }\n",
  711. " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
  712. " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
  713. " }\n",
  714. " (() => {\n",
  715. " let quickchartButtonEl =\n",
  716. " document.querySelector('#df-dea3061f-6364-43a5-8c24-98a030eec895 button');\n",
  717. " quickchartButtonEl.style.display =\n",
  718. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  719. " })();\n",
  720. " </script>\n",
  721. " </div>\n",
  722. " </div>\n",
  723. " </div>\n"
  724. ],
  725. "application/vnd.google.colaboratory.intrinsic+json": {
  726. "type": "dataframe",
  727. "variable_name": "sentence_bench",
  728. "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
  729. }
  730. },
  731. "metadata": {},
  732. "execution_count": 12
  733. }
  734. ],
  735. "source": [
  736. "sentence_bench.head(3)"
  737. ]
  738. },
  739. {
  740. "cell_type": "markdown",
  741. "metadata": {
  742. "id": "wDV7ysXf2b_H"
  743. },
  744. "source": [
  745. "### Get ManaTTS"
  746. ]
  747. },
  748. {
  749. "cell_type": "code",
  750. "execution_count": null,
  751. "metadata": {
  752. "colab": {
  753. "base_uri": "https://localhost:8080/"
  754. },
  755. "id": "TcL5ZLvSSnVB",
  756. "outputId": "f4989c23-9afd-4aff-8346-e2b3f8838bd4"
  757. },
  758. "outputs": [
  759. {
  760. "output_type": "execute_result",
  761. "data": {
  762. "text/plain": [
  763. "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
  764. " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
  765. " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
  766. " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
  767. " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
  768. " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
  769. ]
  770. },
  771. "metadata": {},
  772. "execution_count": 13
  773. }
  774. ],
  775. "source": [
  776. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
  777. "\n",
  778. "# Convert to a list of tuples\n",
  779. "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  780. "\n",
  781. "mana_evaluation_data[:3]"
  782. ]
  783. },
  784. {
  785. "cell_type": "markdown",
  786. "metadata": {
  787. "id": "Jjacw9Mp2eoX"
  788. },
  789. "source": [
  790. "### Get CommonVoice"
  791. ]
  792. },
  793. {
  794. "cell_type": "code",
  795. "execution_count": null,
  796. "metadata": {
  797. "colab": {
  798. "base_uri": "https://localhost:8080/"
  799. },
  800. "id": "-yQnqCGw26sk",
  801. "outputId": "afd35025-a4d8-4331-ad71-5637f5fd8191"
  802. },
  803. "outputs": [
  804. {
  805. "output_type": "execute_result",
  806. "data": {
  807. "text/plain": [
  808. "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
  809. " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
  810. " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
  811. " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
  812. " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
  813. ]
  814. },
  815. "metadata": {},
  816. "execution_count": 14
  817. }
  818. ],
  819. "source": [
  820. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
  821. "\n",
  822. "# Convert to a list of tuples\n",
  823. "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  824. "\n",
  825. "commonvoice_evaluation_data[:3]"
  826. ]
  827. },
  828. {
  829. "cell_type": "markdown",
  830. "metadata": {
  831. "id": "ciSPyhRc3Rvo"
  832. },
  833. "source": [
  834. "### Get Homograph"
  835. ]
  836. },
  837. {
  838. "cell_type": "code",
  839. "execution_count": null,
  840. "metadata": {
  841. "colab": {
  842. "base_uri": "https://localhost:8080/"
  843. },
  844. "id": "XlFc5JbN3Rvz",
  845. "outputId": "fbd4182f-2446-47ed-dbfb-c7ca9128320e"
  846. },
  847. "outputs": [
  848. {
  849. "output_type": "execute_result",
  850. "data": {
  851. "text/plain": [
  852. "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
  853. " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
  854. " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
  855. " 'قدر',\n",
  856. " 'qadar'),\n",
  857. " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
  858. ]
  859. },
  860. "metadata": {},
  861. "execution_count": 15
  862. }
  863. ],
  864. "source": [
  865. "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
  866. "\n",
  867. "# Convert to a list of tuples\n",
  868. "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  869. "\n",
  870. "homograph_evaluation_data[:3]"
  871. ]
  872. },
  873. {
  874. "cell_type": "markdown",
  875. "metadata": {
  876. "id": "R6PE5ds45TPr"
  877. },
  878. "source": [
  879. "# Evaluate Method Outputs"
  880. ]
  881. },
  882. {
  883. "cell_type": "markdown",
  884. "metadata": {
  885. "id": "y73zFlRGIbt9"
  886. },
  887. "source": [
  888. "## PER Evaluation"
  889. ]
  890. },
  891. {
  892. "cell_type": "code",
  893. "execution_count": null,
  894. "metadata": {
  895. "id": "ItuviO3w5Vzv"
  896. },
  897. "outputs": [],
  898. "source": [
  899. "def remove_non_word_chars(text):\n",
  900. " pattern = r'[^\\w\\s\\?]'\n",
  901. " cleaned_text = re.sub(pattern, ' ', text)\n",
  902. " return cleaned_text"
  903. ]
  904. },
  905. {
  906. "cell_type": "code",
  907. "execution_count": null,
  908. "metadata": {
  909. "id": "syQCurXu51TO"
  910. },
  911. "outputs": [],
  912. "source": [
  913. "def remove_white_spaces(text):\n",
  914. " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
  915. " return cleaned_text.strip()"
  916. ]
  917. },
  918. {
  919. "cell_type": "code",
  920. "execution_count": null,
  921. "metadata": {
  922. "id": "V7APkVM053RP"
  923. },
  924. "outputs": [],
  925. "source": [
  926. "def get_word_only_text(text):\n",
  927. " word_only_text = remove_non_word_chars(text)\n",
  928. " extra_space_removed_text = remove_white_spaces(word_only_text)\n",
  929. "\n",
  930. " return extra_space_removed_text"
  931. ]
  932. },
  933. {
  934. "cell_type": "code",
  935. "execution_count": null,
  936. "metadata": {
  937. "id": "ROomKSao57vy"
  938. },
  939. "outputs": [],
  940. "source": [
  941. "def get_texts_cer(reference, model_output):\n",
  942. " # Preprocess input texts to only contain word characters\n",
  943. " word_only_reference = get_word_only_text(reference)\n",
  944. " word_only_output = get_word_only_text(model_output)\n",
  945. "\n",
  946. " # Return +infinity for CER if any of the texts is empty\n",
  947. " if not word_only_reference.strip() or not word_only_output.strip():\n",
  948. " return float('inf')\n",
  949. "\n",
  950. " return cer(word_only_reference, word_only_output)"
  951. ]
  952. },
  953. {
  954. "cell_type": "code",
  955. "execution_count": null,
  956. "metadata": {
  957. "id": "4vHLUjp48hc3"
  958. },
  959. "outputs": [],
  960. "source": [
  961. "def get_avg_cer_of_method(method_outputs, references):\n",
  962. " cers = []\n",
  963. " for idx, o in enumerate(method_outputs):\n",
  964. " cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
  965. " if cer != float('inf'):\n",
  966. " cers.append(cer)\n",
  967. "\n",
  968. " return sum(cers) / len(cers)"
  969. ]
  970. },
  971. {
  972. "cell_type": "markdown",
  973. "metadata": {
  974. "id": "oBgNtpFQDwku"
  975. },
  976. "source": [
  977. "## Homograph Evaluation"
  978. ]
  979. },
  980. {
  981. "cell_type": "code",
  982. "execution_count": null,
  983. "metadata": {
  984. "id": "J445ULEvEEDn"
  985. },
  986. "outputs": [],
  987. "source": [
  988. "def get_homograph_performance(outputs, references):\n",
  989. " corrects = 0\n",
  990. " total = 0\n",
  991. "\n",
  992. " for idx, (g, p, homograph, right) in enumerate(references):\n",
  993. " if homograph != '':\n",
  994. " total += 1\n",
  995. " if right in outputs[idx]:\n",
  996. " corrects += 1\n",
  997. "\n",
  998. " return corrects / total"
  999. ]
  1000. },
  1001. {
  1002. "cell_type": "markdown",
  1003. "metadata": {
  1004. "id": "JGEUIrbi9kNH"
  1005. },
  1006. "source": [
  1007. "# Full bench"
  1008. ]
  1009. },
  1010. {
  1011. "cell_type": "code",
  1012. "execution_count": null,
  1013. "metadata": {
  1014. "id": "fGzQvL8V9mln"
  1015. },
  1016. "outputs": [],
  1017. "source": [
  1018. "benchmark = []\n",
  1019. "\n",
  1020. "for g, p in mana_evaluation_data:\n",
  1021. " benchmark.append((g, p, '', ''))\n",
  1022. "\n",
  1023. "for g, p in commonvoice_evaluation_data:\n",
  1024. " benchmark.append((g, p, '', ''))\n",
  1025. "\n",
  1026. "for g, p, w, r in homograph_evaluation_data:\n",
  1027. " benchmark.append((g, p, w, r))\n",
  1028. "\n",
  1029. "benchmark = benchmark[:400]"
  1030. ]
  1031. },
  1032. {
  1033. "cell_type": "code",
  1034. "execution_count": null,
  1035. "metadata": {
  1036. "id": "4jlXFt8tCPWB"
  1037. },
  1038. "outputs": [],
  1039. "source": [
  1040. "def print_all_metrics(predictions):\n",
  1041. " per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
  1042. " # acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)\n",
  1043. " homograph = get_homograph_performance(predictions, benchmark) * 100\n",
  1044. "\n",
  1045. " print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
  1046. " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
  1047. ]
  1048. },
  1049. {
  1050. "cell_type": "markdown",
  1051. "metadata": {
  1052. "id": "fTRgGM_8_Fwg"
  1053. },
  1054. "source": [
  1055. "# Inference"
  1056. ]
  1057. },
  1058. {
  1059. "cell_type": "code",
  1060. "execution_count": null,
  1061. "metadata": {
  1062. "id": "17lrgWh__Mzr"
  1063. },
  1064. "outputs": [],
  1065. "source": [
  1066. "graphemes = [item[0] for item in benchmark]"
  1067. ]
  1068. },
  1069. {
  1070. "cell_type": "code",
  1071. "execution_count": null,
  1072. "metadata": {
  1073. "id": "ajqTWtNb_HBd"
  1074. },
  1075. "outputs": [],
  1076. "source": [
  1077. "import time\n",
  1078. "\n",
  1079. "start_time = time.time()\n",
  1080. "\n",
  1081. "outputs = g2p.generate(graphemes, use_rules=True)\n",
  1082. "\n",
  1083. "total_time = time.time() - start_time\n",
  1084. "avg_time = total_time / len(graphemes) if len(graphemes) > 0 else 0"
  1085. ]
  1086. },
  1087. {
  1088. "cell_type": "markdown",
  1089. "metadata": {
  1090. "id": "jPXWBZ4R_bGs"
  1091. },
  1092. "source": [
  1093. "# Mapping"
  1094. ]
  1095. },
  1096. {
  1097. "cell_type": "code",
  1098. "execution_count": null,
  1099. "metadata": {
  1100. "id": "c8C2sJjJA4na"
  1101. },
  1102. "outputs": [],
  1103. "source": [
  1104. "mapped_outputs = []\n",
  1105. "\n",
  1106. "# Define the replacements\n",
  1107. "replacements = {\n",
  1108. " 'a': 'A',\n",
  1109. " '$': 'S',\n",
  1110. " '/': 'a',\n",
  1111. " '1': '',\n",
  1112. " ';': 'Z',\n",
  1113. " '@': '?',\n",
  1114. " 'c': 'C'\n",
  1115. "}\n",
  1116. "\n",
  1117. "# Apply replacements\n",
  1118. "mapped_outputs = [\n",
  1119. " ''.join(replacements.get(char, char) for char in output)\n",
  1120. " for output in outputs\n",
  1121. "]"
  1122. ]
  1123. },
  1124. {
  1125. "cell_type": "markdown",
  1126. "metadata": {
  1127. "id": "JAIAobLFCKCr"
  1128. },
  1129. "source": [
  1130. "# Results"
  1131. ]
  1132. },
  1133. {
  1134. "cell_type": "code",
  1135. "execution_count": null,
  1136. "metadata": {
  1137. "colab": {
  1138. "base_uri": "https://localhost:8080/"
  1139. },
  1140. "id": "CEs_TODaAFHO",
  1141. "outputId": "48aa9c83-811c-4eb6-a829-ef1fff0ea36b"
  1142. },
  1143. "outputs": [
  1144. {
  1145. "output_type": "stream",
  1146. "name": "stdout",
  1147. "text": [
  1148. "PER: \t\t\t3.9804\n",
  1149. "HOMOGRAPH: \t\t76.8868\n",
  1150. "TOTAL TIME:\t\t174.7042 (s)\n",
  1151. "AVG TIME:\t\t0.4368 (s)+\n"
  1152. ]
  1153. }
  1154. ],
  1155. "source": [
  1156. "print_all_metrics(mapped_outputs)\n",
  1157. "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
  1158. "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)+\")"
  1159. ]
  1160. },
  1161. {
  1162. "cell_type": "markdown",
  1163. "metadata": {
  1164. "id": "DeOaBaWEJI6x"
  1165. },
  1166. "source": [
  1167. "# Runs\n",
  1168. "\n",
  1169. "## First\n",
  1170. "\n",
  1171. "```\n",
  1172. "PER: \t\t\t3.9804\n",
  1173. "HOMOGRAPH: \t\t76.8868\n",
  1174. "TOTAL TIME:\t\t182.5777 (s)\n",
  1175. "AVG TIME:\t\t0.4564 (s)+\n",
  1176. "```\n",
  1177. "\n",
  1178. "## Second\n",
  1179. "\n",
  1180. "```\n",
  1181. "PER: \t\t\t3.9804\n",
  1182. "HOMOGRAPH: \t\t76.8868\n",
  1183. "TOTAL TIME:\t\t191.1550 (s)\n",
  1184. "AVG TIME:\t\t0.4779 (s)+\n",
  1185. "```\n",
  1186. "\n",
  1187. "## Third\n",
  1188. "\n",
  1189. "```\n",
  1190. "PER: \t\t\t3.9804\n",
  1191. "HOMOGRAPH: \t\t76.8868\n",
  1192. "TOTAL TIME:\t\t173.8426 (s)\n",
  1193. "AVG TIME:\t\t0.4346 (s)+\n",
  1194. "```\n",
  1195. "\n",
  1196. "## Fourth\n",
  1197. "\n",
  1198. "```\n",
  1199. "PER: \t\t\t3.9804\n",
  1200. "HOMOGRAPH: \t\t76.8868\n",
  1201. "TOTAL TIME:\t\t172.3748 (s)\n",
  1202. "AVG TIME:\t\t0.4309 (s)+\n",
  1203. "```\n",
  1204. "\n",
  1205. "## Fifth\n",
  1206. "\n",
  1207. "```\n",
  1208. "PER: \t\t\t3.9804\n",
  1209. "HOMOGRAPH: \t\t76.8868\n",
  1210. "TOTAL TIME:\t\t174.7042 (s)\n",
  1211. "AVG TIME:\t\t0.4368 (s)+\n",
  1212. "```"
  1213. ]
  1214. }
  1215. ],
  1216. "metadata": {
  1217. "colab": {
  1218. "collapsed_sections": [
  1219. "AdU8VMTIOWLZ",
  1220. "a3zuvbqx2l68",
  1221. "XjAPkfq7SF87",
  1222. "R6PE5ds45TPr",
  1223. "y73zFlRGIbt9",
  1224. "oBgNtpFQDwku",
  1225. "JGEUIrbi9kNH",
  1226. "fTRgGM_8_Fwg",
  1227. "jPXWBZ4R_bGs"
  1228. ],
  1229. "provenance": []
  1230. },
  1231. "gpuClass": "standard",
  1232. "kernelspec": {
  1233. "display_name": "Python 3",
  1234. "name": "python3"
  1235. },
  1236. "language_info": {
  1237. "name": "python"
  1238. }
  1239. },
  1240. "nbformat": 4,
  1241. "nbformat_minor": 0
  1242. }