Benchmarking notebooks for various Persian G2P models, including Homo-GE2PE and Homo-T5, comparing their performance on the SentenceBench dataset.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Benchmark_GE2PE.ipynb 46KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206
  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "provenance": [],
  7. "collapsed_sections": [
  8. "AdU8VMTIOWLZ",
  9. "a3zuvbqx2l68",
  10. "XjAPkfq7SF87",
  11. "R6PE5ds45TPr",
  12. "y73zFlRGIbt9",
  13. "oBgNtpFQDwku",
  14. "JGEUIrbi9kNH",
  15. "fTRgGM_8_Fwg",
  16. "jPXWBZ4R_bGs"
  17. ]
  18. },
  19. "kernelspec": {
  20. "name": "python3",
  21. "display_name": "Python 3"
  22. },
  23. "language_info": {
  24. "name": "python"
  25. },
  26. "gpuClass": "standard"
  27. },
  28. "cells": [
  29. {
  30. "cell_type": "markdown",
  31. "source": [
  32. "# Setup Environment"
  33. ],
  34. "metadata": {
  35. "id": "9sEfZoepGP8x"
  36. }
  37. },
  38. {
  39. "cell_type": "code",
  40. "source": [
  41. "! pip install hazm==0.10.0"
  42. ],
  43. "metadata": {
  44. "colab": {
  45. "base_uri": "https://localhost:8080/",
  46. "height": 770
  47. },
  48. "id": "u6n8Hc1hQSy7",
  49. "outputId": "e5448572-c76c-4336-97e0-4e931a1c3940"
  50. },
  51. "execution_count": null,
  52. "outputs": [
  53. {
  54. "output_type": "stream",
  55. "name": "stdout",
  56. "text": [
  57. "Requirement already satisfied: hazm==0.10.0 in /usr/local/lib/python3.11/dist-packages (0.10.0)\n",
  58. "Requirement already satisfied: fasttext-wheel<0.10.0,>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (0.9.2)\n",
  59. "Requirement already satisfied: flashtext<3.0,>=2.7 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (2.7)\n",
  60. "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (4.3.3)\n",
  61. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
  62. "Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
  63. " Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
  64. "Requirement already satisfied: python-crfsuite<0.10.0,>=0.9.9 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (0.9.11)\n",
  65. "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
  66. "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (2.13.6)\n",
  67. "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
  68. "Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.13.1)\n",
  69. "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
  70. "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
  71. "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.4.2)\n",
  72. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
  73. "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
  74. "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
  75. "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
  76. "Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
  77. "Installing collected packages: numpy\n",
  78. " Attempting uninstall: numpy\n",
  79. " Found existing installation: numpy 1.26.0\n",
  80. " Uninstalling numpy-1.26.0:\n",
  81. " Successfully uninstalled numpy-1.26.0\n",
  82. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  83. "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
  84. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
  85. "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
  86. "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
  87. "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  88. "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
  89. "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
  90. "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
  91. "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
  92. "\u001b[0mSuccessfully installed numpy-1.24.3\n"
  93. ]
  94. },
  95. {
  96. "output_type": "display_data",
  97. "data": {
  98. "application/vnd.colab-display-data+json": {
  99. "pip_warning": {
  100. "packages": [
  101. "numpy"
  102. ]
  103. },
  104. "id": "02f4ece44a3543ebb22ad3f3301874b3"
  105. }
  106. },
  107. "metadata": {}
  108. }
  109. ]
  110. },
  111. {
  112. "cell_type": "code",
  113. "source": [
  114. "!pip install numpy==1.26.0"
  115. ],
  116. "metadata": {
  117. "colab": {
  118. "base_uri": "https://localhost:8080/"
  119. },
  120. "id": "iA2Jjex-KMqx",
  121. "outputId": "521918bf-2909-4310-c2f8-5774c16a6215"
  122. },
  123. "execution_count": null,
  124. "outputs": [
  125. {
  126. "output_type": "stream",
  127. "name": "stdout",
  128. "text": [
  129. "Collecting numpy==1.26.0\n",
  130. " Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n",
  131. "Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
  132. "Installing collected packages: numpy\n",
  133. " Attempting uninstall: numpy\n",
  134. " Found existing installation: numpy 1.24.3\n",
  135. " Uninstalling numpy-1.24.3:\n",
  136. " Successfully uninstalled numpy-1.24.3\n",
  137. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  138. "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n",
  139. "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\u001b[0m\u001b[31m\n",
  140. "\u001b[0mSuccessfully installed numpy-1.26.0\n"
  141. ]
  142. }
  143. ]
  144. },
  145. {
  146. "cell_type": "code",
  147. "source": [
  148. "from IPython.display import display, HTML\n",
  149. "\n",
  150. "display(HTML(\"\"\"\n",
  151. "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
  152. " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
  153. "</div>\n",
  154. "\"\"\"))"
  155. ],
  156. "metadata": {
  157. "colab": {
  158. "base_uri": "https://localhost:8080/",
  159. "height": 54
  160. },
  161. "id": "QP854R4YHf4I",
  162. "outputId": "5bd9bf27-9215-48da-ee73-24cfe9162e69"
  163. },
  164. "execution_count": null,
  165. "outputs": [
  166. {
  167. "output_type": "display_data",
  168. "data": {
  169. "text/plain": [
  170. "<IPython.core.display.HTML object>"
  171. ],
  172. "text/html": [
  173. "\n",
  174. "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
  175. " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
  176. "</div>\n"
  177. ]
  178. },
  179. "metadata": {}
  180. }
  181. ]
  182. },
  183. {
  184. "cell_type": "code",
  185. "source": [
  186. "!pip install -q --upgrade --no-cache-dir gdown"
  187. ],
  188. "metadata": {
  189. "id": "EVO9pn8Ou3o1"
  190. },
  191. "execution_count": null,
  192. "outputs": []
  193. },
  194. {
  195. "cell_type": "code",
  196. "source": [
  197. "!pip install -q unidecode\n",
  198. "!pip install -q transformers"
  199. ],
  200. "metadata": {
  201. "colab": {
  202. "base_uri": "https://localhost:8080/"
  203. },
  204. "id": "brKU69ZQvEiz",
  205. "outputId": "b4a6bc53-ddc0-4660-909a-5108aa4f7db0"
  206. },
  207. "execution_count": null,
  208. "outputs": [
  209. {
  210. "output_type": "stream",
  211. "name": "stdout",
  212. "text": [
  213. "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/235.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.4/235.8 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  214. "\u001b[?25h"
  215. ]
  216. }
  217. ]
  218. },
  219. {
  220. "cell_type": "code",
  221. "source": [
  222. "!pip install jiwer"
  223. ],
  224. "metadata": {
  225. "colab": {
  226. "base_uri": "https://localhost:8080/"
  227. },
  228. "id": "grp-l-cbGNWm",
  229. "outputId": "58c70285-e3ce-4c56-cb38-3cf7e07d26d1"
  230. },
  231. "execution_count": null,
  232. "outputs": [
  233. {
  234. "output_type": "stream",
  235. "name": "stdout",
  236. "text": [
  237. "Collecting jiwer\n",
  238. " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
  239. "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
  240. "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
  241. " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
  242. "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
  243. "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
  244. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m33.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  245. "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
  246. "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
  247. ]
  248. }
  249. ]
  250. },
  251. {
  252. "cell_type": "code",
  253. "source": [
  254. "import pandas as pd\n",
  255. "import re\n",
  256. "from jiwer import cer"
  257. ],
  258. "metadata": {
  259. "id": "dQ0osefGGSpJ"
  260. },
  261. "execution_count": null,
  262. "outputs": []
  263. },
  264. {
  265. "cell_type": "markdown",
  266. "source": [
  267. "# Setup Model"
  268. ],
  269. "metadata": {
  270. "id": "Nwt1YBYVqcva"
  271. }
  272. },
  273. {
  274. "cell_type": "code",
  275. "source": [
  276. "!gdown -q 1CrCX8SNhMcmi3KogffFaS4pSaC0t73nJ # The Checkpoint\n",
  277. "!unzip -q ./checkpoint-320.zip\n",
  278. "!rm ./checkpoint-320.zip"
  279. ],
  280. "metadata": {
  281. "id": "x-kHFEm8u8Xg"
  282. },
  283. "execution_count": null,
  284. "outputs": []
  285. },
  286. {
  287. "cell_type": "code",
  288. "source": [
  289. "!gdown -q 11Yb0QjyP2R3RvN1oSCX9m0DL2_bzDeZS # Parsivar for normalization\n",
  290. "!unzip -q ./Parsivar.zip\n",
  291. "!rm ./Parsivar.zip"
  292. ],
  293. "metadata": {
  294. "id": "CGVVxGpivULm"
  295. },
  296. "execution_count": null,
  297. "outputs": []
  298. },
  299. {
  300. "cell_type": "code",
  301. "source": [
  302. "! gdown 1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34 # GE2PE.py"
  303. ],
  304. "metadata": {
  305. "colab": {
  306. "base_uri": "https://localhost:8080/"
  307. },
  308. "id": "K-mPQF5ykcmF",
  309. "outputId": "3de14d12-0746-4114-e8ae-c9772e377f31"
  310. },
  311. "execution_count": null,
  312. "outputs": [
  313. {
  314. "output_type": "stream",
  315. "name": "stdout",
  316. "text": [
  317. "Downloading...\n",
  318. "From (original): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34\n",
  319. "From (redirected): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34&confirm=t&uuid=868f5ff1-a03c-4810-aa3c-acf1dda6dbaf\n",
  320. "To: /content/GE2PE.py\n",
  321. "\r 0% 0.00/4.96k [00:00<?, ?B/s]\r100% 4.96k/4.96k [00:00<00:00, 13.9MB/s]\n"
  322. ]
  323. }
  324. ]
  325. },
  326. {
  327. "cell_type": "code",
  328. "source": [
  329. "!sed -i 's+from collections import Iterable+from collections.abc import Iterable+g' /content/Parsivar/token_merger.py"
  330. ],
  331. "metadata": {
  332. "id": "VIRvJy8naB0f"
  333. },
  334. "execution_count": null,
  335. "outputs": []
  336. },
  337. {
  338. "cell_type": "code",
  339. "source": [
  340. "from GE2PE import GE2PE\n",
  341. "\n",
  342. "g2p = GE2PE()\n",
  343. "\n",
  344. "g2p.generate(['تست مدل تبدیل نویسه به واج', 'این کتابِ علی است'], use_rules=True)"
  345. ],
  346. "metadata": {
  347. "colab": {
  348. "base_uri": "https://localhost:8080/"
  349. },
  350. "id": "Qs-J5B3ykaYz",
  351. "outputId": "bade658e-452b-4844-ff0e-5c505047d359"
  352. },
  353. "execution_count": null,
  354. "outputs": [
  355. {
  356. "output_type": "execute_result",
  357. "data": {
  358. "text/plain": [
  359. "['teste modele t/bdile nevise be vaj', '@in ketabe @/li @/st']"
  360. ]
  361. },
  362. "metadata": {},
  363. "execution_count": 9
  364. }
  365. ]
  366. },
  367. {
  368. "cell_type": "markdown",
  369. "metadata": {
  370. "id": "XjAPkfq7SF87"
  371. },
  372. "source": [
  373. "# Get Evaluation Data"
  374. ]
  375. },
  376. {
  377. "cell_type": "code",
  378. "source": [
  379. "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
  380. ],
  381. "metadata": {
  382. "id": "qwCG0jX-88nQ",
  383. "colab": {
  384. "base_uri": "https://localhost:8080/"
  385. },
  386. "outputId": "103a73f8-d4f8-4b30-e103-60a1cda125bf"
  387. },
  388. "execution_count": null,
  389. "outputs": [
  390. {
  391. "output_type": "stream",
  392. "name": "stdout",
  393. "text": [
  394. "--2025-05-10 15:31:20-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
  395. "Resolving huggingface.co (huggingface.co)... 3.169.137.5, 3.169.137.119, 3.169.137.19, ...\n",
  396. "Connecting to huggingface.co (huggingface.co)|3.169.137.5|:443... connected.\n",
  397. "HTTP request sent, awaiting response... 200 OK\n",
  398. "Length: 56026 (55K) [text/plain]\n",
  399. "Saving to: ‘SentenceBench.csv’\n",
  400. "\n",
  401. "SentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.03s \n",
  402. "\n",
  403. "2025-05-10 15:31:20 (1.77 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
  404. "\n"
  405. ]
  406. }
  407. ]
  408. },
  409. {
  410. "cell_type": "code",
  411. "source": [
  412. "sentence_bench = pd.read_csv('SentenceBench.csv')"
  413. ],
  414. "metadata": {
  415. "id": "hJO-UAPDQvcb"
  416. },
  417. "execution_count": null,
  418. "outputs": []
  419. },
  420. {
  421. "cell_type": "code",
  422. "source": [
  423. "sentence_bench.head(3)"
  424. ],
  425. "metadata": {
  426. "colab": {
  427. "base_uri": "https://localhost:8080/"
  428. },
  429. "id": "qlYbrnUa9LAN",
  430. "outputId": "d31f0d21-7f88-48b1-daf8-aeac0b15efa5"
  431. },
  432. "execution_count": null,
  433. "outputs": [
  434. {
  435. "output_type": "execute_result",
  436. "data": {
  437. "text/plain": [
  438. " dataset grapheme \\\n",
  439. "0 homograph من قدر تو را می‌دانم \n",
  440. "1 homograph از قضای الهی به قدر الهی پناه می‌برم \n",
  441. "2 homograph به دست و صورتم کرم زدم \n",
  442. "\n",
  443. " phoneme homograph word \\\n",
  444. "0 man qadr-e to rA mi-dAnam قدر \n",
  445. "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
  446. "2 be dast-o suratam kerem zadam کرم \n",
  447. "\n",
  448. " pronunciation \n",
  449. "0 qadr \n",
  450. "1 qadar \n",
  451. "2 kerem "
  452. ],
  453. "text/html": [
  454. "\n",
  455. " <div id=\"df-672a1961-0329-4c15-a7b4-ad09056c0997\" class=\"colab-df-container\">\n",
  456. " <div>\n",
  457. "<style scoped>\n",
  458. " .dataframe tbody tr th:only-of-type {\n",
  459. " vertical-align: middle;\n",
  460. " }\n",
  461. "\n",
  462. " .dataframe tbody tr th {\n",
  463. " vertical-align: top;\n",
  464. " }\n",
  465. "\n",
  466. " .dataframe thead th {\n",
  467. " text-align: right;\n",
  468. " }\n",
  469. "</style>\n",
  470. "<table border=\"1\" class=\"dataframe\">\n",
  471. " <thead>\n",
  472. " <tr style=\"text-align: right;\">\n",
  473. " <th></th>\n",
  474. " <th>dataset</th>\n",
  475. " <th>grapheme</th>\n",
  476. " <th>phoneme</th>\n",
  477. " <th>homograph word</th>\n",
  478. " <th>pronunciation</th>\n",
  479. " </tr>\n",
  480. " </thead>\n",
  481. " <tbody>\n",
  482. " <tr>\n",
  483. " <th>0</th>\n",
  484. " <td>homograph</td>\n",
  485. " <td>من قدر تو را می‌دانم</td>\n",
  486. " <td>man qadr-e to rA mi-dAnam</td>\n",
  487. " <td>قدر</td>\n",
  488. " <td>qadr</td>\n",
  489. " </tr>\n",
  490. " <tr>\n",
  491. " <th>1</th>\n",
  492. " <td>homograph</td>\n",
  493. " <td>از قضای الهی به قدر الهی پناه می‌برم</td>\n",
  494. " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
  495. " <td>قدر</td>\n",
  496. " <td>qadar</td>\n",
  497. " </tr>\n",
  498. " <tr>\n",
  499. " <th>2</th>\n",
  500. " <td>homograph</td>\n",
  501. " <td>به دست و صورتم کرم زدم</td>\n",
  502. " <td>be dast-o suratam kerem zadam</td>\n",
  503. " <td>کرم</td>\n",
  504. " <td>kerem</td>\n",
  505. " </tr>\n",
  506. " </tbody>\n",
  507. "</table>\n",
  508. "</div>\n",
  509. " <div class=\"colab-df-buttons\">\n",
  510. "\n",
  511. " <div class=\"colab-df-container\">\n",
  512. " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-672a1961-0329-4c15-a7b4-ad09056c0997')\"\n",
  513. " title=\"Convert this dataframe to an interactive table.\"\n",
  514. " style=\"display:none;\">\n",
  515. "\n",
  516. " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
  517. " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
  518. " </svg>\n",
  519. " </button>\n",
  520. "\n",
  521. " <style>\n",
  522. " .colab-df-container {\n",
  523. " display:flex;\n",
  524. " gap: 12px;\n",
  525. " }\n",
  526. "\n",
  527. " .colab-df-convert {\n",
  528. " background-color: #E8F0FE;\n",
  529. " border: none;\n",
  530. " border-radius: 50%;\n",
  531. " cursor: pointer;\n",
  532. " display: none;\n",
  533. " fill: #1967D2;\n",
  534. " height: 32px;\n",
  535. " padding: 0 0 0 0;\n",
  536. " width: 32px;\n",
  537. " }\n",
  538. "\n",
  539. " .colab-df-convert:hover {\n",
  540. " background-color: #E2EBFA;\n",
  541. " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  542. " fill: #174EA6;\n",
  543. " }\n",
  544. "\n",
  545. " .colab-df-buttons div {\n",
  546. " margin-bottom: 4px;\n",
  547. " }\n",
  548. "\n",
  549. " [theme=dark] .colab-df-convert {\n",
  550. " background-color: #3B4455;\n",
  551. " fill: #D2E3FC;\n",
  552. " }\n",
  553. "\n",
  554. " [theme=dark] .colab-df-convert:hover {\n",
  555. " background-color: #434B5C;\n",
  556. " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
  557. " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
  558. " fill: #FFFFFF;\n",
  559. " }\n",
  560. " </style>\n",
  561. "\n",
  562. " <script>\n",
  563. " const buttonEl =\n",
  564. " document.querySelector('#df-672a1961-0329-4c15-a7b4-ad09056c0997 button.colab-df-convert');\n",
  565. " buttonEl.style.display =\n",
  566. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  567. "\n",
  568. " async function convertToInteractive(key) {\n",
  569. " const element = document.querySelector('#df-672a1961-0329-4c15-a7b4-ad09056c0997');\n",
  570. " const dataTable =\n",
  571. " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
  572. " [key], {});\n",
  573. " if (!dataTable) return;\n",
  574. "\n",
  575. " const docLinkHtml = 'Like what you see? Visit the ' +\n",
  576. " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
  577. " + ' to learn more about interactive tables.';\n",
  578. " element.innerHTML = '';\n",
  579. " dataTable['output_type'] = 'display_data';\n",
  580. " await google.colab.output.renderOutput(dataTable, element);\n",
  581. " const docLink = document.createElement('div');\n",
  582. " docLink.innerHTML = docLinkHtml;\n",
  583. " element.appendChild(docLink);\n",
  584. " }\n",
  585. " </script>\n",
  586. " </div>\n",
  587. "\n",
  588. "\n",
  589. " <div id=\"df-a2ffba04-53d0-4761-8940-297fc38c3d3f\">\n",
  590. " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-a2ffba04-53d0-4761-8940-297fc38c3d3f')\"\n",
  591. " title=\"Suggest charts\"\n",
  592. " style=\"display:none;\">\n",
  593. "\n",
  594. "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
  595. " width=\"24px\">\n",
  596. " <g>\n",
  597. " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
  598. " </g>\n",
  599. "</svg>\n",
  600. " </button>\n",
  601. "\n",
  602. "<style>\n",
  603. " .colab-df-quickchart {\n",
  604. " --bg-color: #E8F0FE;\n",
  605. " --fill-color: #1967D2;\n",
  606. " --hover-bg-color: #E2EBFA;\n",
  607. " --hover-fill-color: #174EA6;\n",
  608. " --disabled-fill-color: #AAA;\n",
  609. " --disabled-bg-color: #DDD;\n",
  610. " }\n",
  611. "\n",
  612. " [theme=dark] .colab-df-quickchart {\n",
  613. " --bg-color: #3B4455;\n",
  614. " --fill-color: #D2E3FC;\n",
  615. " --hover-bg-color: #434B5C;\n",
  616. " --hover-fill-color: #FFFFFF;\n",
  617. " --disabled-bg-color: #3B4455;\n",
  618. " --disabled-fill-color: #666;\n",
  619. " }\n",
  620. "\n",
  621. " .colab-df-quickchart {\n",
  622. " background-color: var(--bg-color);\n",
  623. " border: none;\n",
  624. " border-radius: 50%;\n",
  625. " cursor: pointer;\n",
  626. " display: none;\n",
  627. " fill: var(--fill-color);\n",
  628. " height: 32px;\n",
  629. " padding: 0;\n",
  630. " width: 32px;\n",
  631. " }\n",
  632. "\n",
  633. " .colab-df-quickchart:hover {\n",
  634. " background-color: var(--hover-bg-color);\n",
  635. " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
  636. " fill: var(--button-hover-fill-color);\n",
  637. " }\n",
  638. "\n",
  639. " .colab-df-quickchart-complete:disabled,\n",
  640. " .colab-df-quickchart-complete:disabled:hover {\n",
  641. " background-color: var(--disabled-bg-color);\n",
  642. " fill: var(--disabled-fill-color);\n",
  643. " box-shadow: none;\n",
  644. " }\n",
  645. "\n",
  646. " .colab-df-spinner {\n",
  647. " border: 2px solid var(--fill-color);\n",
  648. " border-color: transparent;\n",
  649. " border-bottom-color: var(--fill-color);\n",
  650. " animation:\n",
  651. " spin 1s steps(1) infinite;\n",
  652. " }\n",
  653. "\n",
  654. " @keyframes spin {\n",
  655. " 0% {\n",
  656. " border-color: transparent;\n",
  657. " border-bottom-color: var(--fill-color);\n",
  658. " border-left-color: var(--fill-color);\n",
  659. " }\n",
  660. " 20% {\n",
  661. " border-color: transparent;\n",
  662. " border-left-color: var(--fill-color);\n",
  663. " border-top-color: var(--fill-color);\n",
  664. " }\n",
  665. " 30% {\n",
  666. " border-color: transparent;\n",
  667. " border-left-color: var(--fill-color);\n",
  668. " border-top-color: var(--fill-color);\n",
  669. " border-right-color: var(--fill-color);\n",
  670. " }\n",
  671. " 40% {\n",
  672. " border-color: transparent;\n",
  673. " border-right-color: var(--fill-color);\n",
  674. " border-top-color: var(--fill-color);\n",
  675. " }\n",
  676. " 60% {\n",
  677. " border-color: transparent;\n",
  678. " border-right-color: var(--fill-color);\n",
  679. " }\n",
  680. " 80% {\n",
  681. " border-color: transparent;\n",
  682. " border-right-color: var(--fill-color);\n",
  683. " border-bottom-color: var(--fill-color);\n",
  684. " }\n",
  685. " 90% {\n",
  686. " border-color: transparent;\n",
  687. " border-bottom-color: var(--fill-color);\n",
  688. " }\n",
  689. " }\n",
  690. "</style>\n",
  691. "\n",
  692. " <script>\n",
  693. " async function quickchart(key) {\n",
  694. " const quickchartButtonEl =\n",
  695. " document.querySelector('#' + key + ' button');\n",
  696. " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
  697. " quickchartButtonEl.classList.add('colab-df-spinner');\n",
  698. " try {\n",
  699. " const charts = await google.colab.kernel.invokeFunction(\n",
  700. " 'suggestCharts', [key], {});\n",
  701. " } catch (error) {\n",
  702. " console.error('Error during call to suggestCharts:', error);\n",
  703. " }\n",
  704. " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
  705. " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
  706. " }\n",
  707. " (() => {\n",
  708. " let quickchartButtonEl =\n",
  709. " document.querySelector('#df-a2ffba04-53d0-4761-8940-297fc38c3d3f button');\n",
  710. " quickchartButtonEl.style.display =\n",
  711. " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
  712. " })();\n",
  713. " </script>\n",
  714. " </div>\n",
  715. " </div>\n",
  716. " </div>\n"
  717. ],
  718. "application/vnd.google.colaboratory.intrinsic+json": {
  719. "type": "dataframe",
  720. "variable_name": "sentence_bench",
  721. "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
  722. }
  723. },
  724. "metadata": {},
  725. "execution_count": 12
  726. }
  727. ]
  728. },
  729. {
  730. "cell_type": "markdown",
  731. "metadata": {
  732. "id": "wDV7ysXf2b_H"
  733. },
  734. "source": [
  735. "### Get ManaTTS"
  736. ]
  737. },
  738. {
  739. "cell_type": "code",
  740. "execution_count": null,
  741. "metadata": {
  742. "colab": {
  743. "base_uri": "https://localhost:8080/"
  744. },
  745. "id": "TcL5ZLvSSnVB",
  746. "outputId": "d4bcec8f-566b-4574-d9a4-9d2f893fba98"
  747. },
  748. "outputs": [
  749. {
  750. "output_type": "execute_result",
  751. "data": {
  752. "text/plain": [
  753. "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
  754. " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
  755. " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
  756. " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
  757. " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
  758. " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
  759. ]
  760. },
  761. "metadata": {},
  762. "execution_count": 13
  763. }
  764. ],
  765. "source": [
  766. "is_mana = sentence_bench['dataset'] == 'mana-tts'\n",
  767. "filtered_rows = sentence_bench.loc[is_mana, ['grapheme', 'phoneme']]\n",
  768. "\n",
  769. "# Plain (grapheme, phoneme) tuples, one per row, without the index\n",
  770. "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  771. "mana_evaluation_data[:3]"
  772. ]
  773. },
  774. {
  775. "cell_type": "markdown",
  776. "metadata": {
  777. "id": "Jjacw9Mp2eoX"
  778. },
  779. "source": [
  780. "### Get CommonVoice"
  781. ]
  782. },
  783. {
  784. "cell_type": "code",
  785. "execution_count": null,
  786. "metadata": {
  787. "id": "-yQnqCGw26sk",
  788. "colab": {
  789. "base_uri": "https://localhost:8080/"
  790. },
  791. "outputId": "d8c72ac5-e590-4cf3-fcdd-4e14054e323e"
  792. },
  793. "outputs": [
  794. {
  795. "output_type": "execute_result",
  796. "data": {
  797. "text/plain": [
  798. "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
  799. " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
  800. " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
  801. " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
  802. " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
  803. ]
  804. },
  805. "metadata": {},
  806. "execution_count": 14
  807. }
  808. ],
  809. "source": [
  810. "is_commonvoice = sentence_bench['dataset'] == 'commonvoice'\n",
  811. "filtered_rows = sentence_bench.loc[is_commonvoice, ['grapheme', 'phoneme']]\n",
  812. "\n",
  813. "# Plain (grapheme, phoneme) tuples, one per row, without the index\n",
  814. "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  815. "commonvoice_evaluation_data[:3]"
  816. ]
  817. },
  818. {
  819. "cell_type": "markdown",
  820. "metadata": {
  821. "id": "ciSPyhRc3Rvo"
  822. },
  823. "source": [
  824. "### Get Homograph"
  825. ]
  826. },
  827. {
  828. "cell_type": "code",
  829. "execution_count": null,
  830. "metadata": {
  831. "id": "XlFc5JbN3Rvz",
  832. "colab": {
  833. "base_uri": "https://localhost:8080/"
  834. },
  835. "outputId": "7cf07ce9-3232-4a2c-c5fd-70ad316d79b0"
  836. },
  837. "outputs": [
  838. {
  839. "output_type": "execute_result",
  840. "data": {
  841. "text/plain": [
  842. "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
  843. " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
  844. " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
  845. " 'قدر',\n",
  846. " 'qadar'),\n",
  847. " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
  848. ]
  849. },
  850. "metadata": {},
  851. "execution_count": 15
  852. }
  853. ],
  854. "source": [
  855. "is_homograph = sentence_bench['dataset'] == 'homograph'\n",
  856. "filtered_rows = sentence_bench.loc[is_homograph, ['grapheme', 'phoneme', 'homograph word', 'pronunciation']]\n",
  857. "\n",
  858. "# Plain 4-tuples in the column order selected above, without the index\n",
  859. "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
  860. "homograph_evaluation_data[:3]"
  861. ]
  862. },
  863. {
  864. "cell_type": "markdown",
  865. "metadata": {
  866. "id": "R6PE5ds45TPr"
  867. },
  868. "source": [
  869. "# Evaluate Method Outputs"
  870. ]
  871. },
  872. {
  873. "cell_type": "markdown",
  874. "metadata": {
  875. "id": "y73zFlRGIbt9"
  876. },
  877. "source": [
  878. "## PER Evaluation"
  879. ]
  880. },
  881. {
  882. "cell_type": "code",
  883. "execution_count": null,
  884. "metadata": {
  885. "id": "ItuviO3w5Vzv"
  886. },
  887. "outputs": [],
  888. "source": [
  889. "def remove_non_word_chars(text):\n",
  890. "    # Anything that is not a word character, whitespace or '?' becomes a space\n",
  891. "    non_word = re.compile(r'[^\\w\\s\\?]')\n",
  892. "    return non_word.sub(' ', text)"
  893. ]
  894. },
  895. {
  896. "cell_type": "code",
  897. "execution_count": null,
  898. "metadata": {
  899. "id": "syQCurXu51TO"
  900. },
  901. "outputs": [],
  902. "source": [
  903. "def remove_white_spaces(text):\n",
  904. "    # Squeeze every whitespace run down to one space, then trim both ends\n",
  905. "    return re.sub(r'\\s+', ' ', text).strip()"
  906. ]
  907. },
  908. {
  909. "cell_type": "code",
  910. "execution_count": null,
  911. "metadata": {
  912. "id": "V7APkVM053RP"
  913. },
  914. "outputs": [],
  915. "source": [
  916. "def get_word_only_text(text):\n",
  917. "    # Punctuation is blanked out first; the gaps that leaves are collapsed afterwards\n",
  918. "    without_punctuation = remove_non_word_chars(text)\n",
  919. "\n",
  920. "    return remove_white_spaces(without_punctuation)"
  921. ]
  922. },
  923. {
  924. "cell_type": "code",
  925. "execution_count": null,
  926. "metadata": {
  927. "id": "ROomKSao57vy"
  928. },
  929. "outputs": [],
  930. "source": [
  931. "def get_texts_cer(reference, model_output):\n",
  932. "    # Clean both strings with get_word_only_text before scoring\n",
  933. "    ref_text = get_word_only_text(reference)\n",
  934. "    hyp_text = get_word_only_text(model_output)\n",
  935. "\n",
  936. "    # An empty side cannot be scored; signal that with +infinity\n",
  937. "    if not (ref_text.strip() and hyp_text.strip()):\n",
  938. "        return float('inf')\n",
  939. "\n",
  940. "    return cer(ref_text, hyp_text)"
  941. ]
  942. },
  943. {
  944. "cell_type": "code",
  945. "execution_count": null,
  946. "metadata": {
  947. "id": "4vHLUjp48hc3"
  948. },
  949. "outputs": [],
  950. "source": [
  951. "def get_avg_cer_of_method(method_outputs, references):\n",
  952. "    # Mean CER over every scorable pair. get_texts_cer takes the reference first; hyphens are ignored on both sides\n",
  953. "    scores = []\n",
  954. "    for idx, output in enumerate(method_outputs):\n",
  955. "        score = get_texts_cer(references[idx][1].replace('-', ''), output.replace('-', ''))\n",
  956. "        if score != float('inf'):  # unscorable pairs are left out of the mean\n",
  957. "            scores.append(score)\n",
  958. "    return sum(scores) / len(scores) if scores else float('inf')  # nothing scorable: same sentinel as get_texts_cer"
  959. ]
  960. },
  961. {
  962. "cell_type": "markdown",
  963. "metadata": {
  964. "id": "oBgNtpFQDwku"
  965. },
  966. "source": [
  967. "## Homograph Evaluation"
  968. ]
  969. },
  970. {
  971. "cell_type": "code",
  972. "execution_count": null,
  973. "metadata": {
  974. "id": "J445ULEvEEDn"
  975. },
  976. "outputs": [],
  977. "source": [
  978. "def get_homograph_performance(outputs, references):\n",
  979. "    # Share of homograph rows whose expected pronunciation appears in the matching output\n",
  980. "    corrects = total = 0\n",
  981. "    for idx, (_, _, homograph, right) in enumerate(references):\n",
  982. "        if homograph == '':  # rows without a homograph annotation are not counted\n",
  983. "            continue\n",
  984. "        total += 1\n",
  985. "        if right in outputs[idx]:  # substring match against the whole output\n",
  986. "            corrects += 1\n",
  987. "\n",
  988. "    return corrects / total if total else 0.0  # no homograph rows: report 0 instead of dividing by zero"
  989. ]
  990. },
  991. {
  992. "cell_type": "markdown",
  993. "metadata": {
  994. "id": "JGEUIrbi9kNH"
  995. },
  996. "source": [
  997. "# Full bench"
  998. ]
  999. },
  1000. {
  1001. "cell_type": "code",
  1002. "execution_count": null,
  1003. "metadata": {
  1004. "id": "fGzQvL8V9mln"
  1005. },
  1006. "outputs": [],
  1007. "source": [
  1008. "# Unified rows: (grapheme, phoneme, homograph word, pronunciation)\n",
  1009. "benchmark = []\n",
  1010. "\n",
  1011. "# ManaTTS and CommonVoice rows have no homograph fields, so those two slots are ''\n",
  1012. "for plain_rows in (mana_evaluation_data, commonvoice_evaluation_data):\n",
  1013. "    benchmark.extend((g, p, '', '') for g, p in plain_rows)\n",
  1014. "\n",
  1015. "# Homograph rows already carry all four fields\n",
  1016. "benchmark.extend((g, p, w, r) for g, p, w, r in homograph_evaluation_data)\n",
  1017. "\n",
  1018. "# Keep at most the first 400 rows\n",
  1019. "benchmark = benchmark[:400]"
  1020. ]
  1021. },
  1022. {
  1023. "cell_type": "code",
  1024. "execution_count": null,
  1025. "metadata": {
  1026. "id": "4jlXFt8tCPWB"
  1027. },
  1028. "outputs": [],
  1029. "source": [
  1030. "def print_all_metrics(predictions):\n",
  1031. "    # Scale both scores by 100 so they print as percentages\n",
  1032. "    per = 100 * get_avg_cer_of_method(predictions, benchmark)\n",
  1033. "    homograph = 100 * get_homograph_performance(predictions, benchmark)\n",
  1034. "\n",
  1035. "    print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
  1036. "    print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
  1037. ]
  1038. },
  1039. {
  1040. "cell_type": "markdown",
  1041. "source": [
  1042. "# Inference"
  1043. ],
  1044. "metadata": {
  1045. "id": "fTRgGM_8_Fwg"
  1046. }
  1047. },
  1048. {
  1049. "cell_type": "code",
  1050. "source": [
  1051. "graphemes = [grapheme for grapheme, _, _, _ in benchmark]"
  1052. ],
  1053. "metadata": {
  1054. "id": "17lrgWh__Mzr"
  1055. },
  1056. "execution_count": null,
  1057. "outputs": []
  1058. },
  1059. {
  1060. "cell_type": "code",
  1061. "source": [
  1062. "import time\n",
  1063. "\n",
  1064. "start_time = time.perf_counter()  # monotonic interval clock; time.time() can jump with system clock changes\n",
  1065. "\n",
  1066. "outputs = g2p.generate(graphemes, use_rules=True)\n",
  1067. "\n",
  1068. "total_time = time.perf_counter() - start_time\n",
  1069. "avg_time = total_time / len(graphemes) if graphemes else 0"
  1070. ],
  1071. "metadata": {
  1072. "id": "ajqTWtNb_HBd"
  1073. },
  1074. "execution_count": null,
  1075. "outputs": []
  1076. },
  1077. {
  1078. "cell_type": "markdown",
  1079. "source": [
  1080. "# Mapping"
  1081. ],
  1082. "metadata": {
  1083. "id": "jPXWBZ4R_bGs"
  1084. }
  1085. },
  1086. {
  1087. "cell_type": "code",
  1088. "source": [
  1089. "# Translate each model output character by character before scoring\n",
  1090. "\n",
  1091. "# Define the replacements ('' removes the character)\n",
  1092. "replacements = {\n",
  1093. "    'a': 'A',\n",
  1094. "    '$': 'S',\n",
  1095. "    '/': 'a',\n",
  1096. "    '1': '',\n",
  1097. "    ';': 'Z',\n",
  1098. "    '@': '?',\n",
  1099. "    'c': 'C'\n",
  1100. "}\n",
  1101. "\n",
  1102. "# Apply replacements; characters without an entry pass through unchanged\n",
  1103. "mapped_outputs = [\n",
  1104. "    ''.join(replacements.get(char, char) for char in output)\n",
  1105. "    for output in outputs\n",
  1106. "]"
  1107. ],
  1108. "metadata": {
  1109. "id": "c8C2sJjJA4na"
  1110. },
  1111. "execution_count": null,
  1112. "outputs": []
  1113. },
  1114. {
  1115. "cell_type": "markdown",
  1116. "source": [
  1117. "# Results"
  1118. ],
  1119. "metadata": {
  1120. "id": "JAIAobLFCKCr"
  1121. }
  1122. },
  1123. {
  1124. "cell_type": "code",
  1125. "source": [
  1126. "print_all_metrics(mapped_outputs)\n",
  1127. "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
  1128. "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)+\")"
  1129. ],
  1130. "metadata": {
  1131. "colab": {
  1132. "base_uri": "https://localhost:8080/"
  1133. },
  1134. "id": "CEs_TODaAFHO",
  1135. "outputId": "e2f49ad9-e667-49b7-ebfc-c2480434a3cc"
  1136. },
  1137. "execution_count": null,
  1138. "outputs": [
  1139. {
  1140. "output_type": "stream",
  1141. "name": "stdout",
  1142. "text": [
  1143. "PER: \t\t\t4.8063\n",
  1144. "HOMOGRAPH: \t\t47.1698\n",
  1145. "TOTAL TIME:\t\t173.3057 (s)\n",
  1146. "AVG TIME:\t\t0.4333 (s)+\n"
  1147. ]
  1148. }
  1149. ]
  1150. },
  1151. {
  1152. "cell_type": "markdown",
  1153. "source": [
  1154. "# Runs\n",
  1155. "\n",
  1156. "## First\n",
  1157. "\n",
  1158. "```\n",
  1159. "PER: \t\t\t4.8063\n",
  1160. "HOMOGRAPH: \t\t47.1698\n",
  1161. "TOTAL TIME:\t\t193.6718 (s)\n",
  1162. "AVG TIME:\t\t0.4842 (s)+\n",
  1163. "```\n",
  1164. "\n",
  1165. "## Second\n",
  1166. "\n",
  1167. "```\n",
  1168. "PER: \t\t\t4.8063\n",
  1169. "HOMOGRAPH: \t\t47.1698\n",
  1170. "TOTAL TIME:\t\t166.1370 (s)\n",
  1171. "AVG TIME:\t\t0.4153 (s)+\n",
  1172. "```\n",
  1173. "\n",
  1174. "## Third\n",
  1175. "\n",
  1176. "```\n",
  1177. "PER: \t\t\t4.8063\n",
  1178. "HOMOGRAPH: \t\t47.1698\n",
  1179. "TOTAL TIME:\t\t180.0494 (s)\n",
  1180. "AVG TIME:\t\t0.4501 (s)+\n",
  1181. "```\n",
  1182. "\n",
  1183. "## Fourth\n",
  1184. "\n",
  1185. "```\n",
  1186. "PER: \t\t\t4.8063\n",
  1187. "HOMOGRAPH: \t\t47.1698\n",
  1188. "TOTAL TIME:\t\t179.7101 (s)\n",
  1189. "AVG TIME:\t\t0.4493 (s)+\n",
  1190. "```\n",
  1191. "\n",
  1192. "## Fifth\n",
  1193. "\n",
  1194. "```\n",
  1195. "PER: \t\t\t4.8063\n",
  1196. "HOMOGRAPH: \t\t47.1698\n",
  1197. "TOTAL TIME:\t\t173.3057 (s)\n",
  1198. "AVG TIME:\t\t0.4333 (s)+\n",
  1199. "```"
  1200. ],
  1201. "metadata": {
  1202. "id": "DeOaBaWEJI6x"
  1203. }
  1204. }
  1205. ]
  1206. }