12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9sEfZoepGP8x"
- },
- "source": [
- "# Setup Environment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "u6n8Hc1hQSy7",
- "outputId": "4b73a4e7-90d9-4be8-d800-7ee57c5d0d6b"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Collecting hazm==0.10.0\n",
- " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n",
- "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm==0.10.0)\n",
- " Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n",
- "Collecting flashtext<3.0,>=2.7 (from hazm==0.10.0)\n",
- " Downloading flashtext-2.7.tar.gz (14 kB)\n",
- " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- "Collecting gensim<5.0.0,>=4.3.1 (from hazm==0.10.0)\n",
- " Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)\n",
- "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (3.9.1)\n",
- "Collecting numpy==1.24.3 (from hazm==0.10.0)\n",
- " Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
- "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm==0.10.0)\n",
- " Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n",
- "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.11/dist-packages (from hazm==0.10.0) (1.6.1)\n",
- "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0)\n",
- " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
- "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm==0.10.0) (75.2.0)\n",
- "Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm==0.10.0)\n",
- " Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.11/dist-packages (from gensim<5.0.0,>=4.3.1->hazm==0.10.0) (7.1.0)\n",
- "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (8.1.8)\n",
- "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (1.5.0)\n",
- "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (2024.11.6)\n",
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk<4.0.0,>=3.8.1->hazm==0.10.0) (4.67.1)\n",
- "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm==0.10.0) (3.6.0)\n",
- "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm==0.10.0) (1.17.2)\n",
- "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.7/26.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hBuilding wheels for collected packages: flashtext\n",
- " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9300 sha256=78287f302c5e4a8386f41636f5409c9340612d4ade63fc8a79516d6eb9621d2f\n",
- " Stored in directory: /root/.cache/pip/wheels/49/20/47/f03dfa8a7239c54cbc44ff7389eefbf888d2c1873edaaec888\n",
- "Successfully built flashtext\n",
- "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, scipy, fasttext-wheel, gensim, hazm\n",
- " Attempting uninstall: numpy\n",
- " Found existing installation: numpy 2.0.2\n",
- " Uninstalling numpy-2.0.2:\n",
- " Successfully uninstalled numpy-2.0.2\n",
- " Attempting uninstall: scipy\n",
- " Found existing installation: scipy 1.15.2\n",
- " Uninstalling scipy-1.15.2:\n",
- " Successfully uninstalled scipy-1.15.2\n",
- "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
- "treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.\n",
- "pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\n",
- "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.\n",
- "jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\n",
- "albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
- "albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n",
- "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.\n",
- "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
- "blosc2 3.3.2 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.\n",
- "jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
- "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 gensim-4.3.3 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11 scipy-1.13.1\n"
- ]
- },
- {
- "output_type": "display_data",
- "data": {
- "application/vnd.colab-display-data+json": {
- "pip_warning": {
- "packages": [
- "numpy"
- ]
- },
- "id": "be4e054b011d4e0e930386438573d963"
- }
- },
- "metadata": {}
- }
- ],
- "source": [
- "! pip install hazm==0.10.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "iA2Jjex-KMqx",
- "outputId": "b69e9e85-54f8-49ea-e50c-8f62aca42a65"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Collecting numpy==1.26.0\n",
- " Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n",
- "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m89.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hInstalling collected packages: numpy\n",
- " Attempting uninstall: numpy\n",
- " Found existing installation: numpy 1.24.3\n",
- " Uninstalling numpy-1.24.3:\n",
- " Successfully uninstalled numpy-1.24.3\n",
- "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
- "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.26.0 which is incompatible.\n",
- "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.\n",
- "tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\u001b[0m\u001b[31m\n",
- "\u001b[0mSuccessfully installed numpy-1.26.0\n"
- ]
- }
- ],
- "source": [
- "!pip install numpy==1.26.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 54
- },
- "id": "QP854R4YHf4I",
- "outputId": "5f5efc6a-795d-4b7d-a9ae-6d079a614fac"
- },
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "<IPython.core.display.HTML object>"
- ],
- "text/html": [
- "\n",
- "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
- " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
- "</div>\n"
- ]
- },
- "metadata": {}
- }
- ],
- "source": [
- "from IPython.display import display, HTML\n",
- "\n",
- "display(HTML(\"\"\"\n",
- "<div style='color: white; background-color: #f44336; padding: 10px; border-radius: 5px;'>\n",
- " <strong>Please restart the notebook!</strong> Click on <b>Runtime</b> → <b>Restart session</b> and then re-run all cells.\n",
- "</div>\n",
- "\"\"\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "EVO9pn8Ou3o1"
- },
- "outputs": [],
- "source": [
- "!pip install -q --upgrade --no-cache-dir gdown"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "brKU69ZQvEiz",
- "outputId": "c6f840f0-01f7-4c1a-cf76-dbc5149f601c"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/235.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m235.5/235.8 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h"
- ]
- }
- ],
- "source": [
- "!pip install -q unidecode\n",
- "!pip install -q transformers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "grp-l-cbGNWm",
- "outputId": "fbb8d2e0-8a23-4493-e455-474241c45d6a"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Collecting jiwer\n",
- " Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)\n",
- "Requirement already satisfied: click>=8.1.8 in /usr/local/lib/python3.11/dist-packages (from jiwer) (8.1.8)\n",
- "Collecting rapidfuzz>=3.9.7 (from jiwer)\n",
- " Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
- "Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)\n",
- "Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n",
- "Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0\n"
- ]
- }
- ],
- "source": [
- "!pip install jiwer"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "dQ0osefGGSpJ"
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import re\n",
- "from jiwer import cer"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Nwt1YBYVqcva"
- },
- "source": [
- "# Setup Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "x-kHFEm8u8Xg"
- },
- "outputs": [],
- "source": [
- "!gdown -q 1Or8xx3KX-ZNqt0Ag_FUA2q8TWdxvY6Kr # The Checkpoint\n",
- "!unzip -q ge2pe-chpt.zip\n",
- "!rm ./ge2pe-chpt.zip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "CGVVxGpivULm"
- },
- "outputs": [],
- "source": [
- "!gdown -q 11Yb0QjyP2R3RvN1oSCX9m0DL2_bzDeZS # Parsivar for normalization\n",
- "!unzip -q ./Parsivar.zip\n",
- "!rm ./Parsivar.zip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "K-mPQF5ykcmF",
- "outputId": "8368d1b8-65b0-47f5-9727-8dd1f1da134b"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Downloading...\n",
- "From (original): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34\n",
- "From (redirected): https://drive.google.com/uc?id=1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34&confirm=t&uuid=d2dbee7b-e12b-460e-aef6-af169e136cdd\n",
- "To: /content/GE2PE.py\n",
- "\r 0% 0.00/4.96k [00:00<?, ?B/s]\r100% 4.96k/4.96k [00:00<00:00, 12.8MB/s]\n"
- ]
- }
- ],
- "source": [
- "! gdown 1OubKfFhLCVu-O43jfWyPQsZ4B2GNPM34 # GE2PE.py"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "VIRvJy8naB0f"
- },
- "outputs": [],
- "source": [
- "!sed -i 's+from collections import Iterable+from collections.abc import Iterable+g' /content/Parsivar/token_merger.py"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "Qs-J5B3ykaYz",
- "outputId": "eeb0c2ee-539f-44ee-c469-7eed7edca22b"
- },
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "['teste model t/bdil nevise be vaj', '@in ketabe @ali @/st']"
- ]
- },
- "metadata": {},
- "execution_count": 9
- }
- ],
- "source": [
- "from GE2PE import GE2PE\n",
- "\n",
- "g2p = GE2PE(model_path='/content/ge2pe-chpt')\n",
- "\n",
- "g2p.generate(['تست مدل تبدیل نویسه به واج', 'این کتابِ علی است'], use_rules=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "XjAPkfq7SF87"
- },
- "source": [
- "# Get Evaluation Data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "qwCG0jX-88nQ",
- "outputId": "ffe13c8d-3b96-479f-bf8d-04f031ca412d"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "--2025-05-13 06:47:23-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n",
- "Resolving huggingface.co (huggingface.co)... 18.164.174.23, 18.164.174.17, 18.164.174.55, ...\n",
- "Connecting to huggingface.co (huggingface.co)|18.164.174.23|:443... connected.\n",
- "HTTP request sent, awaiting response... 200 OK\n",
- "Length: 56026 (55K) [text/plain]\n",
- "Saving to: ‘SentenceBench.csv’\n",
- "\n",
- "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.71K --.-KB/s in 0.008s \n",
- "\n",
- "2025-05-13 06:47:23 (6.99 MB/s) - ‘SentenceBench.csv’ saved [56026/56026]\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "hJO-UAPDQvcb"
- },
- "outputs": [],
- "source": [
- "sentence_bench = pd.read_csv('SentenceBench.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 143
- },
- "id": "qlYbrnUa9LAN",
- "outputId": "d95b98b2-db8a-4192-a3e3-c8f154a17792"
- },
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " dataset grapheme \\\n",
- "0 homograph من قدر تو را میدانم \n",
- "1 homograph از قضای الهی به قدر الهی پناه میبرم \n",
- "2 homograph به دست و صورتم کرم زدم \n",
- "\n",
- " phoneme homograph word \\\n",
- "0 man qadr-e to rA mi-dAnam قدر \n",
- "1 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n",
- "2 be dast-o suratam kerem zadam کرم \n",
- "\n",
- " pronunciation \n",
- "0 qadr \n",
- "1 qadar \n",
- "2 kerem "
- ],
- "text/html": [
- "\n",
- " <div id=\"df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f\" class=\"colab-df-container\">\n",
- " <div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>dataset</th>\n",
- " <th>grapheme</th>\n",
- " <th>phoneme</th>\n",
- " <th>homograph word</th>\n",
- " <th>pronunciation</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>homograph</td>\n",
- " <td>من قدر تو را میدانم</td>\n",
- " <td>man qadr-e to rA mi-dAnam</td>\n",
- " <td>قدر</td>\n",
- " <td>qadr</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>homograph</td>\n",
- " <td>از قضای الهی به قدر الهی پناه میبرم</td>\n",
- " <td>?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram</td>\n",
- " <td>قدر</td>\n",
- " <td>qadar</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>homograph</td>\n",
- " <td>به دست و صورتم کرم زدم</td>\n",
- " <td>be dast-o suratam kerem zadam</td>\n",
- " <td>کرم</td>\n",
- " <td>kerem</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>\n",
- " <div class=\"colab-df-buttons\">\n",
- "\n",
- " <div class=\"colab-df-container\">\n",
- " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f')\"\n",
- " title=\"Convert this dataframe to an interactive table.\"\n",
- " style=\"display:none;\">\n",
- "\n",
- " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
- " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
- " </svg>\n",
- " </button>\n",
- "\n",
- " <style>\n",
- " .colab-df-container {\n",
- " display:flex;\n",
- " gap: 12px;\n",
- " }\n",
- "\n",
- " .colab-df-convert {\n",
- " background-color: #E8F0FE;\n",
- " border: none;\n",
- " border-radius: 50%;\n",
- " cursor: pointer;\n",
- " display: none;\n",
- " fill: #1967D2;\n",
- " height: 32px;\n",
- " padding: 0 0 0 0;\n",
- " width: 32px;\n",
- " }\n",
- "\n",
- " .colab-df-convert:hover {\n",
- " background-color: #E2EBFA;\n",
- " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
- " fill: #174EA6;\n",
- " }\n",
- "\n",
- " .colab-df-buttons div {\n",
- " margin-bottom: 4px;\n",
- " }\n",
- "\n",
- " [theme=dark] .colab-df-convert {\n",
- " background-color: #3B4455;\n",
- " fill: #D2E3FC;\n",
- " }\n",
- "\n",
- " [theme=dark] .colab-df-convert:hover {\n",
- " background-color: #434B5C;\n",
- " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
- " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
- " fill: #FFFFFF;\n",
- " }\n",
- " </style>\n",
- "\n",
- " <script>\n",
- " const buttonEl =\n",
- " document.querySelector('#df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f button.colab-df-convert');\n",
- " buttonEl.style.display =\n",
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
- "\n",
- " async function convertToInteractive(key) {\n",
- " const element = document.querySelector('#df-c166b69c-b2b6-4908-ba5c-c5fa0250c98f');\n",
- " const dataTable =\n",
- " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
- " [key], {});\n",
- " if (!dataTable) return;\n",
- "\n",
- " const docLinkHtml = 'Like what you see? Visit the ' +\n",
- " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
- " + ' to learn more about interactive tables.';\n",
- " element.innerHTML = '';\n",
- " dataTable['output_type'] = 'display_data';\n",
- " await google.colab.output.renderOutput(dataTable, element);\n",
- " const docLink = document.createElement('div');\n",
- " docLink.innerHTML = docLinkHtml;\n",
- " element.appendChild(docLink);\n",
- " }\n",
- " </script>\n",
- " </div>\n",
- "\n",
- "\n",
- " <div id=\"df-dea3061f-6364-43a5-8c24-98a030eec895\">\n",
- " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-dea3061f-6364-43a5-8c24-98a030eec895')\"\n",
- " title=\"Suggest charts\"\n",
- " style=\"display:none;\">\n",
- "\n",
- "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
- " width=\"24px\">\n",
- " <g>\n",
- " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
- " </g>\n",
- "</svg>\n",
- " </button>\n",
- "\n",
- "<style>\n",
- " .colab-df-quickchart {\n",
- " --bg-color: #E8F0FE;\n",
- " --fill-color: #1967D2;\n",
- " --hover-bg-color: #E2EBFA;\n",
- " --hover-fill-color: #174EA6;\n",
- " --disabled-fill-color: #AAA;\n",
- " --disabled-bg-color: #DDD;\n",
- " }\n",
- "\n",
- " [theme=dark] .colab-df-quickchart {\n",
- " --bg-color: #3B4455;\n",
- " --fill-color: #D2E3FC;\n",
- " --hover-bg-color: #434B5C;\n",
- " --hover-fill-color: #FFFFFF;\n",
- " --disabled-bg-color: #3B4455;\n",
- " --disabled-fill-color: #666;\n",
- " }\n",
- "\n",
- " .colab-df-quickchart {\n",
- " background-color: var(--bg-color);\n",
- " border: none;\n",
- " border-radius: 50%;\n",
- " cursor: pointer;\n",
- " display: none;\n",
- " fill: var(--fill-color);\n",
- " height: 32px;\n",
- " padding: 0;\n",
- " width: 32px;\n",
- " }\n",
- "\n",
- " .colab-df-quickchart:hover {\n",
- " background-color: var(--hover-bg-color);\n",
- " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
- " fill: var(--button-hover-fill-color);\n",
- " }\n",
- "\n",
- " .colab-df-quickchart-complete:disabled,\n",
- " .colab-df-quickchart-complete:disabled:hover {\n",
- " background-color: var(--disabled-bg-color);\n",
- " fill: var(--disabled-fill-color);\n",
- " box-shadow: none;\n",
- " }\n",
- "\n",
- " .colab-df-spinner {\n",
- " border: 2px solid var(--fill-color);\n",
- " border-color: transparent;\n",
- " border-bottom-color: var(--fill-color);\n",
- " animation:\n",
- " spin 1s steps(1) infinite;\n",
- " }\n",
- "\n",
- " @keyframes spin {\n",
- " 0% {\n",
- " border-color: transparent;\n",
- " border-bottom-color: var(--fill-color);\n",
- " border-left-color: var(--fill-color);\n",
- " }\n",
- " 20% {\n",
- " border-color: transparent;\n",
- " border-left-color: var(--fill-color);\n",
- " border-top-color: var(--fill-color);\n",
- " }\n",
- " 30% {\n",
- " border-color: transparent;\n",
- " border-left-color: var(--fill-color);\n",
- " border-top-color: var(--fill-color);\n",
- " border-right-color: var(--fill-color);\n",
- " }\n",
- " 40% {\n",
- " border-color: transparent;\n",
- " border-right-color: var(--fill-color);\n",
- " border-top-color: var(--fill-color);\n",
- " }\n",
- " 60% {\n",
- " border-color: transparent;\n",
- " border-right-color: var(--fill-color);\n",
- " }\n",
- " 80% {\n",
- " border-color: transparent;\n",
- " border-right-color: var(--fill-color);\n",
- " border-bottom-color: var(--fill-color);\n",
- " }\n",
- " 90% {\n",
- " border-color: transparent;\n",
- " border-bottom-color: var(--fill-color);\n",
- " }\n",
- " }\n",
- "</style>\n",
- "\n",
- " <script>\n",
- " async function quickchart(key) {\n",
- " const quickchartButtonEl =\n",
- " document.querySelector('#' + key + ' button');\n",
- " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
- " quickchartButtonEl.classList.add('colab-df-spinner');\n",
- " try {\n",
- " const charts = await google.colab.kernel.invokeFunction(\n",
- " 'suggestCharts', [key], {});\n",
- " } catch (error) {\n",
- " console.error('Error during call to suggestCharts:', error);\n",
- " }\n",
- " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
- " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
- " }\n",
- " (() => {\n",
- " let quickchartButtonEl =\n",
- " document.querySelector('#df-dea3061f-6364-43a5-8c24-98a030eec895 button');\n",
- " quickchartButtonEl.style.display =\n",
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
- " })();\n",
- " </script>\n",
- " </div>\n",
- " </div>\n",
- " </div>\n"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "sentence_bench",
- "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"homograph\",\n \"mana-tts\",\n \"commonvoice\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"\\u0622\\u06cc\\u0627 \\u0628\\u0627\\u06cc\\u062f \\u062d\\u0642\\u06cc\\u0642\\u062a \\u0631\\u0627 \\u0628\\u0647 \\u0622\\u0646\\u200c\\u0647\\u0627 \\u0628\\u06af\\u0648\\u06cc\\u06cc\\u0645\\u061f\",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. 
\\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0648\\u062f \\u0648 \\u0645\\u0647 \\u063a\\u0644\\u06cc\\u0638\\u06cc \\u062f\\u0631 \\u0645\\u062d\\u06cc\\u0637 \\u067e\\u06cc\\u0686\\u06cc\\u062f\\u0647 \\u0628\\u0648\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"?AyA bAyad haqiqat rA be ?AnhA beguyim\\u061f\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dud-o meh-e qalizi dar mohit piCide bud\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 101,\n \"samples\": [\n \"\\u06af\\u0631\\u06cc\\u0645\",\n \"\\u0633\\u0628\\u06a9\\u06cc\",\n \"\\u06a9\\u0645\\u06cc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 210,\n \"samples\": [\n \"darham\",\n \"Sum\",\n \"moSk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
- }
- },
- "metadata": {},
- "execution_count": 12
- }
- ],
- "source": [
- "sentence_bench.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "wDV7ysXf2b_H"
- },
- "source": [
- "### Get ManaTTS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "TcL5ZLvSSnVB",
- "outputId": "f4989c23-9afd-4aff-8346-e2b3f8838bd4"
- },
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n",
- " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA '),\n",
- " ('به نام بی\\u200cوپتیک یا عدسی دورنما آشنا شویم. ',\n",
- " 'be nAm-e biyoptik yA ?adasi-ye durnamA ?ASnA Savim'),\n",
- " ('دراین\\u200cصورت، انجام خودارزیابی و ارائه بازخورد بر عهده خودتان است. ',\n",
- " 'dar ?in surat ?anjAm-e xod?arzyAbi va ?erA?e-ye bAzxord bar ?ohde-ye xodetAn ?ast ')]"
- ]
- },
- "metadata": {},
- "execution_count": 13
- }
- ],
- "source": [
- "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n",
- "\n",
- "# Convert to a list of tuples\n",
- "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
- "\n",
- "mana_evaluation_data[:3]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Jjacw9Mp2eoX"
- },
- "source": [
- "### Get CommonVoice"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "-yQnqCGw26sk",
- "outputId": "afd35025-a4d8-4331-ad71-5637f5fd8191"
- },
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n",
- " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.'),\n",
- " ('پس از مدرسه کودکان به سوی خانه جست و خیز کردند.',\n",
- " 'pas ?az madrese kudakAn be suye xAne jast-o-xiz kardand.'),\n",
- " ('شما نگران زن و بچه این نباش.', 'SomA negarAn-e zan-o-baCCe-ye ?in nabAS.')]"
- ]
- },
- "metadata": {},
- "execution_count": 14
- }
- ],
- "source": [
- "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n",
- "\n",
- "# Convert to a list of tuples\n",
- "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
- "\n",
- "commonvoice_evaluation_data[:3]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ciSPyhRc3Rvo"
- },
- "source": [
- "### Get Homograph"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "XlFc5JbN3Rvz",
- "outputId": "fbd4182f-2446-47ed-dbfb-c7ca9128320e"
- },
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr'),\n",
- " ('از قضای الهی به قدر الهی پناه می\\u200cبرم',\n",
- " '?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram',\n",
- " 'قدر',\n",
- " 'qadar'),\n",
- " ('به دست و صورتم کرم زدم', 'be dast-o suratam kerem zadam', 'کرم', 'kerem')]"
- ]
- },
- "metadata": {},
- "execution_count": 15
- }
- ],
- "source": [
- "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'homograph'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n",
- "\n",
- "# Convert to a list of tuples\n",
- "homograph_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n",
- "\n",
- "homograph_evaluation_data[:3]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "R6PE5ds45TPr"
- },
- "source": [
- "# Evaluate Method Outputs"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "y73zFlRGIbt9"
- },
- "source": [
- "## PER Evaluation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ItuviO3w5Vzv"
- },
- "outputs": [],
- "source": [
- "def remove_non_word_chars(text):\n",
- " pattern = r'[^\\w\\s\\?]'\n",
- " cleaned_text = re.sub(pattern, ' ', text)\n",
- " return cleaned_text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "syQCurXu51TO"
- },
- "outputs": [],
- "source": [
- "def remove_white_spaces(text):\n",
- " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
- " return cleaned_text.strip()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "V7APkVM053RP"
- },
- "outputs": [],
- "source": [
- "def get_word_only_text(text):\n",
- " word_only_text = remove_non_word_chars(text)\n",
- " extra_space_removed_text = remove_white_spaces(word_only_text)\n",
- "\n",
- " return extra_space_removed_text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ROomKSao57vy"
- },
- "outputs": [],
- "source": [
- "def get_texts_cer(reference, model_output):\n",
- " # Preprocess input texts to only contain word characters\n",
- " word_only_reference = get_word_only_text(reference)\n",
- " word_only_output = get_word_only_text(model_output)\n",
- "\n",
- " # Return +infinity for CER if any of the texts is empty\n",
- " if not word_only_reference.strip() or not word_only_output.strip():\n",
- " return float('inf')\n",
- "\n",
- " return cer(word_only_reference, word_only_output)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "4vHLUjp48hc3"
- },
- "outputs": [],
- "source": [
- "def get_avg_cer_of_method(method_outputs, references):\n",
- " cers = []\n",
- " for idx, o in enumerate(method_outputs):\n",
- " cer = get_texts_cer(o.replace('-', ''), references[idx][1].replace('-', ''))\n",
- " if cer != float('inf'):\n",
- " cers.append(cer)\n",
- "\n",
- " return sum(cers) / len(cers)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "oBgNtpFQDwku"
- },
- "source": [
- "## Homograph Evaluation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "J445ULEvEEDn"
- },
- "outputs": [],
- "source": [
- "def get_homograph_performance(outputs, references):\n",
- " corrects = 0\n",
- " total = 0\n",
- "\n",
- " for idx, (g, p, homograph, right) in enumerate(references):\n",
- " if homograph != '':\n",
- " total += 1\n",
- " if right in outputs[idx]:\n",
- " corrects += 1\n",
- "\n",
- " return corrects / total"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "JGEUIrbi9kNH"
- },
- "source": [
- "# Full Benchmark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fGzQvL8V9mln"
- },
- "outputs": [],
- "source": [
- "benchmark = []\n",
- "\n",
- "for g, p in mana_evaluation_data:\n",
- " benchmark.append((g, p, '', ''))\n",
- "\n",
- "for g, p in commonvoice_evaluation_data:\n",
- " benchmark.append((g, p, '', ''))\n",
- "\n",
- "for g, p, w, r in homograph_evaluation_data:\n",
- " benchmark.append((g, p, w, r))\n",
- "\n",
- "benchmark = benchmark[:400]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "4jlXFt8tCPWB"
- },
- "outputs": [],
- "source": [
- "def print_all_metrics(predictions):\n",
- " per = get_avg_cer_of_method(predictions, benchmark) * 100\n",
- " # acc, prec, recall = get_phonetic_model_performance(predictions, benchmark)\n",
- " homograph = get_homograph_performance(predictions, benchmark) * 100\n",
- "\n",
- " print(f\"PER: \\t\\t\\t{per:.4f}\")\n",
- " print(f\"HOMOGRAPH: \\t\\t{homograph:.4f}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fTRgGM_8_Fwg"
- },
- "source": [
- "# Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "17lrgWh__Mzr"
- },
- "outputs": [],
- "source": [
- "graphemes = [item[0] for item in benchmark]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ajqTWtNb_HBd"
- },
- "outputs": [],
- "source": [
- "import time\n",
- "\n",
- "start_time = time.time()\n",
- "\n",
- "outputs = g2p.generate(graphemes, use_rules=True)\n",
- "\n",
- "total_time = time.time() - start_time\n",
- "avg_time = total_time / len(graphemes) if len(graphemes) > 0 else 0"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "jPXWBZ4R_bGs"
- },
- "source": [
- "# Mapping"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "c8C2sJjJA4na"
- },
- "outputs": [],
- "source": [
- "mapped_outputs = []\n",
- "\n",
- "# Define the replacements\n",
- "replacements = {\n",
- " 'a': 'A',\n",
- " '$': 'S',\n",
- " '/': 'a',\n",
- " '1': '',\n",
- " ';': 'Z',\n",
- " '@': '?',\n",
- " 'c': 'C'\n",
- "}\n",
- "\n",
- "# Apply replacements\n",
- "mapped_outputs = [\n",
- " ''.join(replacements.get(char, char) for char in output)\n",
- " for output in outputs\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "JAIAobLFCKCr"
- },
- "source": [
- "# Results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "CEs_TODaAFHO",
- "outputId": "48aa9c83-811c-4eb6-a829-ef1fff0ea36b"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t174.7042 (s)\n",
- "AVG TIME:\t\t0.4368 (s)+\n"
- ]
- }
- ],
- "source": [
- "print_all_metrics(mapped_outputs)\n",
- "print(f\"TOTAL TIME:\\t\\t{total_time:.4f} (s)\")\n",
- "print(f\"AVG TIME:\\t\\t{avg_time:.4f} (s)+\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DeOaBaWEJI6x"
- },
- "source": [
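- "# Runs\n",
- "\n",
- "Metrics from five separate runs of the evaluation above (PER and homograph accuracy in percent, times in seconds).\n",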
- "\n",
- "## First\n",
- "\n",
- "```\n",
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t182.5777 (s)\n",
- "AVG TIME:\t\t0.4564 (s)+\n",
- "```\n",
- "\n",
- "## Second\n",
- "\n",
- "```\n",
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t191.1550 (s)\n",
- "AVG TIME:\t\t0.4779 (s)+\n",
- "```\n",
- "\n",
- "## Third\n",
- "\n",
- "```\n",
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t173.8426 (s)\n",
- "AVG TIME:\t\t0.4346 (s)+\n",
- "```\n",
- "\n",
- "## Fourth\n",
- "\n",
- "```\n",
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t172.3748 (s)\n",
- "AVG TIME:\t\t0.4309 (s)+\n",
- "```\n",
- "\n",
- "## Fifth\n",
- "\n",
- "```\n",
- "PER: \t\t\t3.9804\n",
- "HOMOGRAPH: \t\t76.8868\n",
- "TOTAL TIME:\t\t174.7042 (s)\n",
- "AVG TIME:\t\t0.4368 (s)+\n",
- "```"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [
- "AdU8VMTIOWLZ",
- "a3zuvbqx2l68",
- "XjAPkfq7SF87",
- "R6PE5ds45TPr",
- "y73zFlRGIbt9",
- "oBgNtpFQDwku",
- "JGEUIrbi9kNH",
- "fTRgGM_8_Fwg",
- "jPXWBZ4R_bGs"
- ],
- "provenance": []
- },
- "gpuClass": "standard",
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
|