{ "cells": [ { "cell_type": "markdown", "source": [ "# Setup Environment" ], "metadata": { "id": "ABgLYF9R8viP" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "I3VDTSYocPBc", "outputId": "71f5f740-0871-4bbf-9579-f7dacf9a33ef" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting hazm\n", " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n", "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)\n", " Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n", "Collecting flashtext<3.0,>=2.7 (from hazm)\n", " Downloading flashtext-2.7.tar.gz (14 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (4.3.3)\n", "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (3.9.1)\n", "Collecting numpy==1.24.3 (from hazm)\n", " Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)\n", " Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.10/dist-packages (from hazm) (1.6.0)\n", "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)\n", " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm) (75.1.0)\n", "Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (1.13.1)\n", "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (7.1.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (8.1.8)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (1.4.2)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (2024.11.6)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (4.67.1)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm) (3.5.0)\n", "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm) (1.17.0)\n", "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m59.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: flashtext\n", " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9298 sha256=00e71c3668bd8e587a801eba10ad77cd1062a93b11134d8064513370e729d01f\n", " Stored in directory: /root/.cache/pip/wheels/bc/be/39/c37ad168eb2ff644c9685f52554440372129450f0b8ed203dd\n", "Successfully built flashtext\n", "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, fasttext-wheel, hazm\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 1.26.4\n", " Uninstalling numpy-1.26.4:\n", " Successfully uninstalled numpy-1.26.4\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "albucore 0.0.19 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "albumentations 1.4.20 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "pymc 5.19.1 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11\n" ] }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "numpy" ] }, "id": "3a509826026a4a01ac258472d31eeec1" } }, "metadata": {} } ], "source": [ "!pip install hazm # Requires restart." ] }, { "cell_type": "code", "source": [ "!pip install groq\n", "!pip install jiwer" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mjTNnSAomVHS", "outputId": "e9737eea-df8e-474c-9725-b9923ba9f635" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting groq\n", " Downloading groq-0.14.0-py3-none-any.whl.metadata (14 kB)\n", "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from groq) (3.7.1)\n", "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from groq) (1.9.0)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from groq) (0.28.1)\n", "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from groq) (2.10.4)\n", "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from groq) (1.3.1)\n", "Requirement already satisfied: typing-extensions<5,>=4.10 in /usr/local/lib/python3.10/dist-packages (from groq) (4.12.2)\n", "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->groq) (3.10)\n", "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->groq) (1.2.2)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->groq) (2024.12.14)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->groq) (1.0.7)\n", "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->groq) (0.14.0)\n", "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->groq) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->groq) (2.27.2)\n", "Downloading groq-0.14.0-py3-none-any.whl (109 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m109.5/109.5 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: groq\n", "Successfully installed groq-0.14.0\n", "Collecting jiwer\n", " Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)\n", "Requirement already satisfied: click<9.0.0,>=8.1.3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (8.1.8)\n", "Collecting rapidfuzz<4,>=3 (from jiwer)\n", " Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n", "Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)\n", "Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: rapidfuzz, jiwer\n", "Successfully installed jiwer-3.0.5 rapidfuzz-3.11.0\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "from groq import Groq\n", "import re\n", "from difflib import SequenceMatcher\n", "from jiwer import cer\n", "from tqdm import tqdm" ], "metadata": { "id": "Cj9xpfNkkusV" }, "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Zg2EzX4hOReJ" }, "source": [ "# Setup LLM" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "id": "HsnrlOZprsoK" }, "outputs": [], "source": [ "def get_response(messages):\n", " client = Groq(\n", " api_key= '', # Insert API key\n", " )\n", "\n", " while True:\n", " try:\n", " response = client.chat.completions.create(\n", " model='llama3-70b-8192',\n", " messages=messages,\n", " )\n", "\n", " response = response.choices[0].message.content\n", " return response\n", "\n", " except Exception as e:\n", " print(e)\n", " continue\n" ] }, { "cell_type": "markdown", "metadata": { "id": "AdU8VMTIOWLZ" }, "source": [ "# Get Dictionary" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xm_St_kWOYI-", "outputId": "6d973e40-faa8-4cf5-b52b-7a6af76aabe0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-01-09 20:32:04-- https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv\n", "Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...\n", "Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 7945406 (7.6M) [text/plain]\n", "Saving to: ‘KaamelDict.csv’\n", "\n", "KaamelDict.csv 100%[===================>] 7.58M 11.5MB/s in 0.7s \n", "\n", "2025-01-09 20:32:05 (11.5 MB/s) - ‘KaamelDict.csv’ saved [7945406/7945406]\n", "\n" ] } ], "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "dGYh5bDyRfTg" }, "outputs": [], "source": [ "dict_path = \"KaamelDict.csv\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "WV2x_iLQRhHI" }, "outputs": [], "source": [ "dict_df = pd.read_csv(dict_path)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "OJTyOEoMR-cV" }, "outputs": [], "source": [ "kaamel_dict = {}\n", "\n", "for idx, row in dict_df.iterrows():\n", " g, p = row['grapheme'], ''.join(eval(row['phoneme']))\n", " if g not in kaamel_dict:\n", " kaamel_dict[g] = []\n", " kaamel_dict[g].append(p)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "sIUx62uakGV5" }, "outputs": [], "source": [ "phoneme_to_finglish_map = {\n", " 'A': 'aa',\n", " 'S': 'Sh',\n", " 'Z': 'Zh',\n", " 'q': 'Gh',\n", " 'x': 'Kh',\n", " 'u': 'oo',\n", " '?': \"'\",\n", " 'C': 'Ch'\n", "}\n", "\n", "def replace_phonetic_characters(input_string, char_map):\n", " # Create a translation table using str.maketrans\n", " translation_table = str.maketrans(char_map)\n", "\n", " # Use str.translate to replace characters based on the translation table\n", " return input_string.translate(translation_table)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "feybHtoF7SUq" }, "outputs": [], "source": [ "finglish_kaamel_dict = {}\n", "\n", "for k, vs in kaamel_dict.items():\n", " finglish_vs = []\n", " for v in vs:\n", " p = replace_phonetic_characters(v, phoneme_to_finglish_map)\n", " p = re.sub(\"([^\\w\\-\\?]|^)'\", r'\\1', p)\n", " finglish_vs.append(p)\n", "\n", " finglish_kaamel_dict[k] = finglish_vs" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "99n9orsuMwqe" }, "outputs": [], "source": [ "inverted_finglish_kaamel_dict = {}\n", "\n", "for key, value_list in finglish_kaamel_dict.items():\n", " for value in value_list:\n", " inverted_finglish_kaamel_dict[value] = key" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "ujrYr29iy9TJ" }, "outputs": [], "source": [ "inverted_kaamel_dict = {}\n", "\n", "for key, value_list in kaamel_dict.items():\n", " for value in value_list:\n", " inverted_kaamel_dict[value] = key" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "OoaIwa8nOegN" }, "outputs": [], "source": [ "def word_in_dict(word, inverted_dictionary=inverted_finglish_kaamel_dict):\n", " return word in inverted_dictionary" ] }, { "cell_type": "markdown", "metadata": { "id": "2F06noXfhFoy" }, "source": [ "# Define post-processing" ] }, { "cell_type": "code", "source": [ "output_to_finglish_map = {\n", " 'м': 'm',\n", " 'ʷ': 'v',\n", " 'w': 'v',\n", " 'q': 'Gh',\n", " 'x': 'Kh',\n", " 'u': 'oo',\n", " '?': \"'\",\n", " 'ĉ': 'Ch',\n", " 'č': 'Ch',\n", " '̕': \"'\",\n", " 'ʔ': \"'\",\n", " 'ꞌ': \"'\",\n", " '̛': \"'\",\n", " '’': \"'\",\n", " 'ʼ': \"'\",\n", " 'ʿ': \"'\",\n", " '̓': '',\n", " 'â': 'aa',\n", " 'â': 'aa',\n", " 'ȃ': 'aa',\n", " 'c': 'k',\n", " 'ž': 'Zh',\n", " 'š': 'Sh',\n", " 'W': 'v',\n", " 'β': 'f',\n", " 'е': 'e',\n", " 'х': 'Kh',\n", " '`': \"'\",\n", " 'ɑ': 'aa',\n", " 'ɑ': 'aa',\n", " 'ʃ': 'Sh',\n", " 'ð': 'z',\n", " 'ɾ': 'r',\n", " 'æ': 'a',\n", " 'ɪ': 'e',\n", " 'χ': 'Kh',\n", " 'ɣ': 'Gh',\n", " 'ʒ': 'Zh',\n", " ':': '',\n", " 'ā': 'aa',\n", " 'ː': '',\n", " 'ä': 'aa',\n", " 'á': 'aa',\n", " 'š': 'Sh',\n", " 'ū': 'oo',\n", " 'ś': 's',\n", " 'ī': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", " 'ḥ': 'h',\n", " 'ɒ': 'aa',\n", " 'ʰ': 'h',\n", " 'ə': 'e',\n", " 'R': 'r',\n", " 'W': 'v',\n", " 'Q': 'q',\n", " 'T': 't',\n", " 'Y': 'y',\n", " 'P': 'p',\n", " 'D': 'd',\n", " 'F': 'f',\n", " 'H': 'h',\n", " 'J': 'j',\n", " 'L': 'l',\n", " 'X': 'Kh',\n", " 'V': 'v',\n", " 'B': 'b',\n", " 'N': 'n',\n", " 'M': 'm',\n", " 'K': 'k',\n", " 'G': 'g',\n", " 'U': 'u',\n", " 'O': 'o',\n", " 'I': 'i',\n", " 'E': 'e',\n", " 'ا': 'aa',\n", " 'ب': 'b',\n", " 'پ': 'p',\n", " 'ت': 't',\n", " 'ث': 's',\n", " 'ج': 'j',\n", " 'چ': 'Ch',\n", " 'ح': 'h',\n", " 'خ': 'Kh',\n", " 'د': 'd',\n", " 'ذ': 'z',\n", " 'ر': 'r',\n", " 'ز': 'z',\n", " 'ژ': 'Zh',\n", " 'س': 's',\n", " 'ش': 'Sh',\n", " 'ص': 's',\n", " 'ض': 'z',\n", " 'ط': 't',\n", " 'ظ': 'z',\n", " 'ع': \"'\",\n", " 'غ': 'Gh',\n", " 'ف': 'f',\n", " 'ق': 'Gh',\n", " 'ک': 'k',\n", " 'گ': 'g',\n", " 'ل': 'l',\n", " 'م': 'm',\n", " 'ن': 'n',\n", " 'و': 'v',\n", " 'ه': 'h',\n", " 'ی': 'y',\n", " 'ء': \"'\",\n", " 'ئ': \"'\",\n", " 'ؤ': \"o'\",\n", " 'آ': 'aa',\n", " 'أ': \"a'\",\n", " 'إ': \"e'\",\n", " 'ۀ': 'eye',\n", " 'ŋ': 'ng',\n", " '.': '',\n", " 'ɛ': 'e',\n", " 'ʊ': 'oo',\n", " \"ˈ\": \"'\",\n", " 'ù': 'oo',\n", " 'θ': 's',\n", " '̪': '',\n", " 'ũ': 'oo',\n", " '_': ''\n", "}\n", "\n", "\n", "def replace_LLM_characters(input_string, char_map):\n", " substituted = re.sub(r'tʃʰ', 'ch', input_string)\n", " substituted = re.sub('tʃ', 'ch', substituted)\n", " substituted = re.sub(r't͡S', 'ch', substituted)\n", " substituted = re.sub(r'kʰ', 'k', substituted)\n", " substituted = re.sub(r'pʰ', 'p', substituted)\n", " substituted = re.sub(r'tʰ', 't', substituted)\n", " substituted = re.sub(r'ow', 'o', substituted)\n", " substituted = re.sub('dʒ', 'j', substituted)\n", "\n", " # Create a translation table using str.maketrans\n", " translation_table = str.maketrans(char_map)\n", "\n", " # Use str.translate to replace characters based on the translation table\n", " translated = substituted.translate(translation_table)\n", "\n", " return translated" ], "metadata": { "id": "yU6OkQU1PZ_E" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "5gt9lEDhEp_d" }, "outputs": [], "source": [ "def get_finglish_consonants(word):\n", " char_map = {\n", " 'ا': '', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',\n", " 'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',\n", " 'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': \"'\",\n", " 'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',\n", " 'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': \"'\",'ئ': \"'\", 'ؤ': \"'\",\n", " 'آ': '', 'أ': \"'\", 'إ': \"'\", 'ۀ': 'y'\n", " }\n", " mapped_string = ''.join(char_map.get(char, char) for char in word)\n", " return mapped_string" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "FXWS_bCsHH9B" }, "outputs": [], "source": [ "def get_updated_span(match_span, displacements):\n", " new_start, new_end = match_span[0], match_span[1]\n", " for start, displacement in displacements:\n", " if start <= new_start:\n", " new_start += displacement\n", " new_end += displacement\n", "\n", " return (new_start, new_end)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "vDgHkWhBSZM_" }, "outputs": [], "source": [ "output_to_phonetics_map = {\n", " 'м': 'm',\n", " 'ʷ':' v',\n", " 'w': 'v',\n", " 'c': 'k',\n", " 'ĉ': 'C',\n", " 'č': 'C',\n", " '̕': \"?\",\n", " \"'\": '?',\n", " 'ʔ': \"?\",\n", " 'ꞌ': \"?\",\n", " '̛': \"?\",\n", " '’': \"?\",\n", " 'ʼ': \"?\",\n", " \"'\": '?',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'W': 'v',\n", " 'β': 'f',\n", " 'е': 'e',\n", " '`': \"?\",\n", " 'ɑ': 'A',\n", " 'ɑ': 'A',\n", " 'ʃ': 'S',\n", " 'ð': 'z',\n", " 'ɾ': 'r',\n", " 'æ': 'a',\n", " 'ɪ': 'e',\n", " 'χ': 'x',\n", " 'ɣ': 'q',\n", " 'ʒ': 'Z',\n", " ':': '',\n", " 'ː': '',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 's',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", " 'ḥ': 'h',\n", " 'ɒ': 'A',\n", " 'ʰ': '',\n", " 'ə': 'e',\n", " 'R': 'r',\n", " 'W': 'v',\n", " 'Q': 'q',\n", " 'T': 't',\n", " 'Y': 'y',\n", " 'P': 'p',\n", " 'D': 'd',\n", " 'F': 'f',\n", " 'H': 'h',\n", " 'J': 'j',\n", " 'L': 'l',\n", " 'X': 'x',\n", " 'V': 'v',\n", " 'B': 'b',\n", " 'N': 'n',\n", " 'M': 'm',\n", " 'K': 'k',\n", " 'G': 'g',\n", " 'U': 'u',\n", " 'O': 'o',\n", " 'I': 'i',\n", " 'E': 'e',\n", " 'ا': 'A',\n", " 'ب': 'b',\n", " 'پ': 'p',\n", " 'ت': 't',\n", " 'ث': 's',\n", " 'ج': 'j',\n", " 'چ': 'C',\n", " 'ح': 'h',\n", " 'خ': 'x',\n", " 'د': 'd',\n", " 'ذ': 'z',\n", " 'ر': 'r',\n", " 'ز': 'z',\n", " 'ژ': 'Z',\n", " 'س': 's',\n", " 'ش': 'S',\n", " 'ص': 's',\n", " 'ض': 'z',\n", " 'ط': 't',\n", " 'ظ': 'z',\n", " 'ع': \"?\",\n", " 'غ': 'q',\n", " 'ف': 'f',\n", " 'ق': 'q',\n", " 'ک': 'k',\n", " 'گ': 'g',\n", " 'ل': 'l',\n", " 'م': 'm',\n", " 'ن': 'n',\n", " 'و': 'v',\n", " 'ه': 'h',\n", " 'ی': 'y',\n", " 'ء': \"?\",\n", " 'ئ': \"?\",\n", " 'ؤ': \"o?\",\n", " 'آ': 'A',\n", " 'أ': \"a?\",\n", " 'إ': \"e?\",\n", " 'ۀ': 'eye',\n", " 'ŋ': 'ng',\n", " '.': '',\n", " 'ɛ': 'e',\n", " 'ʊ': 'u',\n", " \"ˈ\": '?',\n", " 'ù': 'u',\n", " 'θ': 's',\n", " '̪': '',\n", " 'ũ': 'u',\n", " '_': '',\n", " 'ç': 'C',\n", " 'ĝ': 'q',\n", " 'ɢ': 'q',\n", " 'ː': '',\n", " 'í': 'i',\n", " 'ŝ': 'S',\n", " '!': '',\n", " 'ǧ': 'q',\n", " 'ʻ': '?',\n", " 'è': 'e',\n", " '�': '',\n", " 'ú': 'u',\n", " 'ô': 'o',\n", " 'ē': 'e',\n", " 'à': 'A',\n", " 'ă': 'A',\n", " 'ǐ': 'i',\n", " 'ü': 'u',\n", " '\\u200e': '',\n", " 'ğ': 'q',\n", " 'ṣ': 'S',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 'S',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", "}\n", "\n", "consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M' ]) + ')'\n", "vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'\n", "\n", "\n", "def replace_LLM_phonetic_characters(input_string, char_map, from_phonetics=False):\n", " if not from_phonetics:\n", " try:\n", " input_string = re.sub(r'Sh', 'S', input_string)\n", " input_string = re.sub(r'Ch', 'C', input_string)\n", " input_string = re.sub(r'Zh', 'Z', input_string)\n", " input_string = re.sub(r'Gh', 'q', input_string)\n", " input_string = re.sub(r'Kh', 'x', input_string)\n", " except:\n", " print(input_string)\n", "\n", " substituted = re.sub(r'ch', 'C', input_string)\n", "\n", " substituted = re.sub(r'tʃʰ', 'C', substituted)\n", " substituted = re.sub(r'tʃ', 'C', substituted)\n", " substituted = re.sub(r't͡S', 'C', substituted)\n", " substituted = re.sub(r'ow', 'o', substituted)\n", " substituted = re.sub('dʒ', 'j', substituted)\n", "\n", " # Create a translation table using str.maketrans\n", " translation_table = str.maketrans(char_map)\n", "\n", " # Use str.translate to replace characters based on the translation table\n", " translated = substituted.translate(translation_table)\n", "\n", " substituted = re.sub('ee', 'i', translated)\n", " substituted = re.sub('ii', 'i', substituted)\n", " substituted = re.sub('oo', 'u', substituted)\n", " substituted = re.sub('uu', 'u', substituted)\n", " substituted = re.sub('aa', 'A', substituted)\n", " substituted = re.sub('AA', 'A', substituted)\n", " substituted = re.sub('Aa', 'A', substituted)\n", " substituted = re.sub('aA', 'A', substituted)\n", " substituted = re.sub(rf'(?:\\b(sh)|(sh){consonants_regex}|(sh)\\b)', 'S', substituted)\n", " substituted = re.sub(rf'(?:\\b(kh)|(kh){consonants_regex}|(kh)\\b)', 'x', substituted)\n", " substituted = re.sub(rf'(?:\\b(zh)|(zh){consonants_regex}|(zh)\\b)', 'Z', substituted)\n", " substituted = re.sub(rf'(?:\\b(gh)|(gh){consonants_regex}|(gh)\\b)', 'q', substituted)\n", "\n", " substituted = re.sub(rf'([^\\w\\-\\?]|^){vowels_regex}', r'\\1?', substituted)\n", " substituted = substituted.replace('?output=[', '')\n", " substituted = substituted.replace('[?output=', '')\n", " substituted = substituted.replace('output=[', '')\n", " substituted = substituted.replace('[output=', '')\n", " substituted = substituted.replace('output=', '')\n", " substituted = substituted.replace('output', '')\n", " substituted = substituted.replace('[', '')\n", " substituted = substituted.replace(']', '')\n", " substituted = substituted.replace('=', '')\n", "\n", " substituted = re.sub(r'[^a-zA-Z\\?\\s]', '', substituted)\n", "\n", " return substituted" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "wYukaprhIOJf" }, "outputs": [], "source": [ "def fix_ambiguities(model_text, gt_text):\n", " # fix sh\n", " i = 0\n", " for c in gt_text:\n", " if c in 'سصث':\n", " while i < len(model_text) - 1:\n", " i += 1\n", " if model_text[i] in 'sS': break\n", "\n", " if c == 'ش':\n", " while i < len(model_text) - 2:\n", " i += 1\n", " if model_text[i] == 'S' and model_text[i + 1] != 'h': break\n", " if model_text[i] in 'Ss' and model_text[i + 1] == 'h':\n", " model_text = model_text[:i] + 'S' + model_text[i + 2:]\n", " break\n", "\n", " if i >= len(model_text) - 1: break\n", "\n", " # fix zh\n", " i = 0\n", " for c in gt_text:\n", " if c in 'زذضظ':\n", " while i < len(model_text) - 1:\n", " i += 1\n", " if model_text[i] in 'zZ': break\n", "\n", " if c == 'ژ':\n", " while i < len(model_text) - 2:\n", " i += 1\n", " if model_text[i] == 'Z' and model_text[i + 1] != 'h': break\n", " if model_text[i] in 'zZ' and model_text[i + 1] == 'h':\n", " model_text = model_text[:i] + 'Z' + model_text[i + 2:]\n", " break\n", "\n", " if i >= len(model_text) - 1: break\n", "\n", "\n", " # fix kh\n", " i = 0\n", " for c in gt_text:\n", " if c == 'ک':\n", " while i < len(model_text) - 1:\n", " i += 1\n", " if model_text[i] in 'kK': break\n", "\n", " if c == 'خ':\n", " while i < len(model_text) - 2:\n", " i += 1\n", " if model_text[i] in 'xX': break\n", " if model_text[i] in 'kK' and model_text[i + 1] == 'h':\n", " model_text = model_text[:i] + 'x' + model_text[i + 2:]\n", " break\n", "\n", " if i >= len(model_text) - 1: break\n", "\n", "\n", " # fix gh\n", " i = 0\n", " for c in gt_text:\n", " if c == 'گ':\n", " while i < len(model_text) - 1:\n", " i += 1\n", " if model_text[i] in 'Gg': break\n", "\n", " if c in 'غق':\n", " while i < len(model_text) - 2:\n", " i += 1\n", " if model_text[i] == 'q': break\n", " if model_text[i] in 'Gg' and model_text[i + 1] == 'h':\n", " model_text = model_text[:i] + 'q' + model_text[i + 2:]\n", " break\n", "\n", " if i >= len(model_text) - 1: break\n", "\n", " return model_text" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "LIUcYVuoBSgY" }, "outputs": [], "source": [ "def substitute_by_dict(model_text, gt_text):\n", " subwords = []\n", " matched_spans = set()\n", " for match in re.finditer(r\"(\\?|\\w|')+(?=[^\\?\\w']|$)\", model_text):\n", " match_text = match.group()\n", " match_span = match.span()\n", "\n", " finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)\n", " if finglish_text in inverted_finglish_kaamel_dict and inverted_finglish_kaamel_dict[finglish_text] in gt_text:\n", " max_sim, max_sim_p = -1, ''\n", " for p in kaamel_dict[inverted_finglish_kaamel_dict[finglish_text]]:\n", " phonetic_text = replace_LLM_phonetic_characters(finglish_text, output_to_phonetics_map)\n", " sim = SequenceMatcher(None, phonetic_text, p).ratio()\n", " if sim > max_sim:\n", " max_sim = sim\n", " max_sim_p = p\n", "\n", " gt_text = gt_text.replace(inverted_finglish_kaamel_dict[finglish_text], '')\n", " subwords.append((match_span, max_sim_p))\n", " matched_spans.add(match_span)\n", "\n", " for match in re.finditer(r\"(\\?|\\w|')+(?=[^\\?\\w']|$)\", model_text):\n", " match_text = match.group()\n", " match_span = match.span()\n", "\n", " if match_span in matched_spans: continue\n", " if not 'sh' in match_text and not 'kh' in match_text and not 'zh' in match_text and not 'Sh' in match_text and not 'Kh' in match_text and not 'Zh' in match_text: continue\n", "\n", " finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)\n", " consonant_finglish = re.sub(r'(е|e|i|u|o|a|ā|ä|â|ā|ɒ|á|A)', '', finglish_text)\n", "\n", " for gt_match in re.finditer(r\"(\\?|\\w|')+(?=[^\\?\\w']|$)\", gt_text):\n", " gt_match_text = gt_match.group()\n", " gt_match_span = gt_match.span()\n", "\n", " gt_consonant_finglish = get_finglish_consonants(gt_match_text)\n", " if SequenceMatcher(None, consonant_finglish, gt_consonant_finglish).ratio() > 0.65:\n", " subwords.append((match_span, fix_ambiguities(match_text, gt_match_text)))\n", "\n", " displacements = []\n", " for span, replacement in subwords:\n", " updates_span = get_updated_span(span, displacements)\n", " model_text = model_text[:updates_span[0]] + replacement + model_text[updates_span[1]:]\n", " displacements.append((updates_span[0], len(replacement) - (updates_span[1] - updates_span[0])))\n", "\n", " return model_text" ] }, { "cell_type": "code", "source": [ "def get_known_words(graphemes, multiple_choices=True, dictionary=finglish_kaamel_dict):\n", " words = re.split('\\W+', graphemes)\n", " if multiple_choices:\n", " return '\\n'.join(f'{w}: {\", \".join(dictionary[w])}' for w in words if w in dictionary)\n", "\n", " return '\\n'.join(f'{w}: {\", \".join(dictionary[w])}' for w in words if w in dictionary and len(dictionary[w]) <= 1)" ], "metadata": { "id": "-g8h4TmnJW_t" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "def correct_output_by_llm_and_dict_info_finglish(grapheme, output, multi=True):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"A model was used to convert Persian sentences into Finglish (Persian written in the Latin alphabet). We have a dictionary with Finglish of some of the words. You are an assistant that corrects the Finglish output of the model choosing the right information from that dictionary. Be careful not to remove the connective Ezafe phonemes '-e' and '-ye' and show ع, ئ, and ٔ with '.\"\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f'''Here is the original Persian sentence: [{grapheme}].\n", " Here is the Fingish output of the model: [{output}].\n", " Here is the Finglish to some words I found from dictionary:\n", " {get_known_words(grapheme, multiple_choices=multi, dictionary=finglish_kaamel_dict)}.\n", " Please return the corrected Finglish of the Persian sentence in brackets like output=[].'''\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " return output" ], "metadata": { "id": "VAIfX2ZX6dm0" }, "execution_count": 20, "outputs": [] }, { "cell_type": "code", "source": [ "def replace_words_with_dict(text, dictionary=finglish_kaamel_dict):\n", " pattern = r'\\b\\w+\\b'\n", "\n", " modified_text = re.sub(pattern, lambda match: f'{dictionary[match.group()][0]}' if match.group() in dictionary and len(dictionary[match.group()]) == 1 else match.group(), text)\n", "\n", " return modified_text" ], "metadata": { "id": "Fln22mqv6_wa" }, "execution_count": 21, "outputs": [] }, { "cell_type": "code", "source": [ "def get_known_words_list(graphemes, multiple_choices=True, dictionary=finglish_kaamel_dict):\n", " words = re.split('\\W+', graphemes)\n", " if multiple_choices:\n", " return [(w, dictionary[w]) for w in words if w in dictionary]\n", "\n", " return [(w, dictionary[w]) for w in words if w in dictionary and len(dictionary[w]) <= 1]" ], "metadata": { "id": "ebGHh6gS8lUs" }, "execution_count": 22, "outputs": [] }, { "cell_type": "code", "source": [ "def substitute_output_by_dict(grapheme, output, dictionary=finglish_kaamel_dict):\n", " ACCEPTED_THRESHOLD = 0.65\n", " output = re.sub(r'([^еeiuoaāäâāɒáA])(-i)', r'\\1i', output)\n", "\n", " alternatives = get_known_words_list(grapheme, dictionary=dictionary)\n", " output_words = re.split('[^-\\w\\?]+', output)\n", " pairs = []\n", "\n", " graphemes = []\n", " for grapheme, phonemes in alternatives:\n", " graphemes.append(grapheme)\n", "\n", " for j, phoneme in enumerate(phonemes):\n", " for i, word in enumerate(output_words):\n", " pairs.append((SequenceMatcher(None, phoneme, word).ratio(), phoneme, word, grapheme))\n", "\n", " sorted_pairs = sorted(pairs, key=lambda x: x[0], reverse=True)\n", "\n", " for score, phoneme, output_word, grapheme in sorted_pairs:\n", " if score < ACCEPTED_THRESHOLD: break\n", " if grapheme not in graphemes: continue\n", " graphemes.remove(grapheme)\n", "\n", " if output_word.endswith('-e'):\n", " output = re.sub(rf'(\\W)({re.escape(output_word)})(\\W)', rf\"\\1{phoneme + '-e'}\\3\", output)\n", "\n", " elif output_word.endswith('-ye'):\n", " output = re.sub(rf'(\\W)({re.escape(output_word)})(\\W)', rf\"\\1{phoneme + '-ye'}\\3\", output)\n", "\n", " elif phoneme[-1] in 'еeiuoaāäâāɒáA' and output_word.endswith('ye') and SequenceMatcher(None, phoneme, word[:-2]).ratio() > score:\n", " output = re.sub(rf'(\\W)({re.escape(output_word)})(\\W)', rf\"\\1{phoneme + '-ye'}\\3\", output)\n", "\n", " elif phoneme[-1] not in 'еeiuoaāäâāɒáA' and output_word.endswith('e') and SequenceMatcher(None, phoneme, word[:-1]).ratio() > score:\n", " output = re.sub(rf'(\\W)({re.escape(output_word)})(\\W)', rf\"\\1{phoneme + '-e'}\\3\", output)\n", "\n", " elif score > ACCEPTED_THRESHOLD:\n", " output = re.sub(rf'(\\W)({re.escape(output_word)})(\\W)', rf\"\\1{phoneme}\\3\", output)\n", "\n", " return output" ], "metadata": { "id": "RqDSYQ328nY7" }, "execution_count": 23, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "7-_ozqCtlEPi" }, "source": [ "# Prompt 1: Naive" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "uJLscT6YlEPk" }, "outputs": [], "source": [ "def prompt1(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are an assistant that converts Persian sentences into their IPA phonemes representation.\"\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the phonemes of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " return output" ] }, { "cell_type": "markdown", "source": [ "# Prompt 2: In-Context Learning (ICL)" ], "metadata": { "id": "2zXtp8dxuDOV" } }, { "cell_type": "code", "source": [ "def prompt2(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their IPA phonemes representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"ɾæŋ-e ?ɒːbiː\", \"زندگی شیرین\": \"zendegiː-je ʃiːɾiːn\").\n", " 3. Use \"ʃ\" for 'ش', \"tʃʰ\" for 'چ', \"x\" for 'خ', \"q\" for 'ق'; \"ɣ\" for 'غ', \"ʒ\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: æ (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: ɒː (آ/ا), iː (ای), uː (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 'tʰ' for ت and ط; 'j' for ی; 'pʰ' for پ; 'kʰ' for ک; 'ɾ' for ر; 'ŋ' for نگ\n", " - Omit silent 'h' at the end of words (e.g., خانه → xɒːne, not xɒːneh)\n", " - Represent ع, ئ , and ء with an ʔ when it's pronounced\n", "\n", " Here are a few examples:\n", " input=[جریان شال چی بود؟], output=[dʒæɾjɒːn-e ʃɒːl tʃʰiː buːd]\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e now dæɾ ɣɒːɾ hæst jɒː xɒːne]\n", " input=[ژن زیبارویان پولدار], output=[ʒen-e ziːbɒːruːjɒːn-e puːldɒːɾ]\n", " input=[اتفاقی نمی‌افتد], output=[?ettefɒːqiː nemiː-?oftɒːd]\n", " input=[گرگ حیوانی وحشی است], output=[goɾg hejvɒːniː væhʃiː ?æst]\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the phonemes of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " return output\n" ], "metadata": { "id": "tIzIEZ8tuDOX" }, "execution_count": 25, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 3: Finglish" ], "metadata": { "id": "CRKJ6U6Hxaz3" } }, { "cell_type": "code", "source": [ "def prompt3(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "MJ-LmleDxaz5" }, "execution_count": 26, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 4: Rule-based Correction" ], "metadata": { "id": "wQTmDqqFLBdR" } }, { "cell_type": "code", "source": [ "def prompt4(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = substitute_output_by_dict(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output\n" ], "metadata": { "id": "hPD_d6rZLBdf" }, "execution_count": 27, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 5: LLM-based Correction" ], "metadata": { "id": "vl1JMrgkL8qc" } }, { "cell_type": "code", "source": [ "def prompt5(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = correct_output_by_llm_and_dict_info_finglish(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "BsSy3b1SL8qm" }, "execution_count": 28, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 6: Dict Hints (1)" ], "metadata": { "id": "r-CZbQIkFQfS" } }, { "cell_type": "code", "source": [ "def prompt6(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, the Finglish of some of the words is given. Use the right pronunciations as help.\n", "\n", " ## Examples\n", " input=[جشن چهل مرد]. dict entries: 'مرد': mord, mard, 'جشن': jaShn, output=[jaShn-e Chehel mard].\n", " input=[گل نو در غار هست یا خانه؟]. dict entries: 'گل': gol, gel, 'در': dar, dorr, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد]. dict entries: . output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f'''This is the Persian sentence: [{grapheme}].\n", " These are the pronunciatin of some of the words I know:\n", " {get_known_words(grapheme, multiple_choices=True)}.\n", " Return Finglish of the Persian sentenc in brackets like output=[].'''\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "IAu4lBBDFQff" }, "execution_count": 29, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 7: Dict Hints (2)" ], "metadata": { "id": "6UpTo6L_1bD8" } }, { "cell_type": "code", "source": [ "def prompt7(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, the Finglish of some of the words is given as help.\n", "\n", " ## Examples\n", " input=[جشن چهل مرد]. dict entries: 'چهل': Chehel, 'جشن': jaShn, output=[jaShn-e Chehel mard].\n", " input=[گل نو در غار هست یا خانه؟]. dict entries: 'غار': Ghaar, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد]. dict entries: . output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f'''This is the Persian sentence: [{grapheme}].\n", " These are the pronunciatin of some of the words I know:\n", " {get_known_words(grapheme, multiple_choices=False)}.\n", " Return Finglish of the Persian sentenc in brackets like output=[].'''\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "3l-Do1xv1bEQ" }, "execution_count": 30, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 8: Dict Hints (3)" ], "metadata": { "id": "G2GAkjbQ61mX" } }, { "cell_type": "code", "source": [ "def prompt8(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.\n", "\n", " Here are a few examples of what I want:\n", " input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]\n", " input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\\n Complete the Finglish of it and return the result in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "9g_AsyX961mg" }, "execution_count": 31, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 9: Combined method" ], "metadata": { "id": "JKZDIkd3qTXq" } }, { "cell_type": "code", "source": [ "def prompt9(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.\n", "\n", " Here are a few examples of what I want:\n", " input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمی‌افتد], output=[ettefaaGhi nemi-oftad]\n", " input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\\n Complete the Finglish of it and return the result in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = correct_output_by_llm_and_dict_info_finglish(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "jOvLHgE-qTX6" }, "execution_count": 32, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XjAPkfq7SF87" }, "source": [ "## Get Evaluation Data" ] }, { "cell_type": "markdown", "source": [ "# Get Evaluation Data" ], "metadata": { "id": "F6soKurLkkWu" } }, { "cell_type": "code", "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv" ], "metadata": { "id": "RbgMVxCsYf7M", "outputId": "8a6293f5-74f5-4eed-d6f0-59210c93f796", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 33, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-01-09 20:32:28-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n", "Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...\n", "Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 56029 (55K) [text/plain]\n", "Saving to: ‘SentenceBench.csv’\n", "\n", "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.72K --.-KB/s in 0.01s \n", "\n", "2025-01-09 20:32:28 (4.37 MB/s) - ‘SentenceBench.csv’ saved [56029/56029]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "sentence_bench = pd.read_csv('SentenceBench.csv', names=['dataset', 'grapheme', 'phoneme', 'homograph word',\t'pronunciation'])" ], "metadata": { "id": "qwCG0jX-88nQ" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "sentence_bench.head(3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "qlYbrnUa9LAN", "outputId": "7152d621-d207-4a4c-931f-454c54e82eb5" }, "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " dataset grapheme \\\n", "0 dataset grapheme \n", "1 polyphone من قدر تو را می‌دانم \n", "2 polyphone از قضای الهی به قدر الهی پناه می‌برم \n", "\n", " phoneme homograph word \\\n", "0 phoneme polyphone word \n", "1 man qadr-e to rA mi-dAnam قدر \n", "2 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n", "\n", " pronunciation \n", "0 pronunciation \n", "1 qadr \n", "2 qadar " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datasetgraphemephonemehomograph wordpronunciation
0datasetgraphemephonemepolyphone wordpronunciation
1polyphoneمن قدر تو را می‌دانمman qadr-e to rA mi-dAnamقدرqadr
2polyphoneاز قضای الهی به قدر الهی پناه می‌برم?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baramقدرqadar
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "sentence_bench", "summary": "{\n \"name\": \"sentence_bench\",\n \"rows\": 401,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"polyphone\",\n \"commonvoice\",\n \"dataset\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"grapheme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 401,\n \"samples\": [\n \"\\u0628\\u0631\\u0646\\u0627\\u0645\\u06c0 \\u0634\\u06a9\\u0646 \\u06a9\\u0647 \\u0645\\u06cc\\u200c\\u062a\\u0648\\u0627\\u0646\\u06cc\\u062f \\u0622\\u0646 \\u0631\\u0627 \\u0628\\u0631\\u0627\\u06cc \\u0633\\u06cc\\u0633\\u062a\\u0645\\u200c\\u0639\\u0627\\u0645\\u0644 \\u0627\\u0646\\u062f\\u0631\\u0648\\u06cc\\u062f \\u0627\\u0632 \\u0627\\u06cc\\u0646 \\u067e\\u06cc\\u0648\\u0646\\u062f \\u062f\\u0631\\u06cc\\u0627\\u0641\\u062a \\u06a9\\u0646\\u06cc\\u062f\\u060c \",\n \"\\u06a9\\u0647 \\u067e\\u06cc\\u0634 \\u0627\\u0632 \\u0627\\u0646\\u0642\\u0644\\u0627\\u0628 \\u0628\\u0647 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u062f\\u062e\\u062a\\u0631\\u0627\\u0646 \\u0648 \\u0632\\u0646\\u0627\\u0646 \\u0646\\u0627\\u0628\\u06cc\\u0646\\u0627 \\u0627\\u062e\\u062a\\u0635\\u0627\\u0635\\u200c\\u06cc\\u0627\\u0641\\u062a\\u0647 \\u0628\\u0648\\u062f. \\u0627\\u063a\\u0644\\u0628 \\u0632\\u0646\\u0627\\u0646\\u06cc \\u06a9\\u0647 \\u062f\\u0631 \\u0627\\u06cc\\u0646 \\u062e\\u0648\\u0627\\u0628\\u06af\\u0627\\u0647 \\u0632\\u0646\\u062f\\u06af\\u06cc \\u0645\\u06cc\\u200c\\u06a9\\u0631\\u062f\\u0646\\u062f\\u060c \",\n \"\\u062f\\u0631 \\u062f\\u0647 \\u067e\\u0627\\u06cc\\u06cc\\u0646 \\u0631\\u0648\\u0633\\u062a\\u0627\\u06cc\\u06cc\\u0627\\u0646 \\u0628\\u0627\\u0635\\u0641\\u0627\\u06cc\\u06cc \\u0647\\u0633\\u062a\\u0646\\u062f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"phoneme\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 401,\n \"samples\": [\n \"barnAmeye Sekan ke mitavAnid ?An rA barAye sistem?Amel-e ?androyd ?az ?in peyvand daryAft konid\",\n \"ke piS ?az ?enqelAb be xAbgAh-e doxtarAn va zanAn-e nAbinA ?extesAsyAfte bud ?aqlab-e zanAni ke dar ?in xAbgAh zendegi mikardand\",\n \"dar deh-e pAyin rustAyiyAne bA-safAyi hastand\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homograph word\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 102,\n \"samples\": [\n \"\\u062f\\u0631\\u062f\",\n \"\\u06a9\\u0645\\u06cc\",\n \"\\u0628\\u0631\\u0647\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pronunciation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 211,\n \"samples\": [\n \"derham\",\n \"Sum\",\n \"mahram\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "markdown", "metadata": { "id": "wDV7ysXf2b_H" }, "source": [ "### Get ManaTTS Data" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TcL5ZLvSSnVB", "outputId": "ae7a0d1b-f067-4ae1-eca0-39b8e1708e95" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('در این نوشته بنا داریم با یک ابزار ساده و مکانیکی افزایش بینایی برای افراد کم\\u200cبینا ',\n", " 'dar ?in neveSte banA dArim bA yek ?abzAr-e sAde va mekAniki-ye ?afzAyeS-e binAyi barAye ?afrAd-e kam\\u200cbinA ')]" ] }, "metadata": {}, "execution_count": 36 } ], "source": [ "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'mana-tts'][['grapheme', 'phoneme']]\n", "\n", "# Convert to a list of tuples\n", "mana_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n", "\n", "mana_evaluation_data[:1]" ] }, { "cell_type": "markdown", "metadata": { "id": "Jjacw9Mp2eoX" }, "source": [ "### Get CommonVoice Data" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "id": "-yQnqCGw26sk", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "15febd0c-92a2-4224-ce0a-b2a8c6e2e116" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('در اکثر شهرها، مرکزی برای خرید دوچرخه وجود دارد.',\n", " 'dar ?aksar-e Sahr-hA, markazi barAye xarid-e doCarxe vojud dArad.')]" ] }, "metadata": {}, "execution_count": 37 } ], "source": [ "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'commonvoice'][['grapheme', 'phoneme']]\n", "\n", "# Convert to a list of tuples\n", "commonvoice_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n", "\n", "commonvoice_evaluation_data[:1]" ] }, { "cell_type": "markdown", "metadata": { "id": "ciSPyhRc3Rvo" }, "source": [ "### Get Homograph Data" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "id": "XlFc5JbN3Rvz", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "96dc0ca2-e7e7-4cbc-a22e-67380d722ae1" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[('من قدر تو را می\\u200cدانم', 'man qadr-e to rA mi-dAnam', 'قدر', 'qadr')]" ] }, "metadata": {}, "execution_count": 38 } ], "source": [ "filtered_rows = sentence_bench[sentence_bench['dataset'] == 'polyphone'][['grapheme', 'phoneme', 'homograph word',\t'pronunciation']]\n", "\n", "# Convert to a list of tuples\n", "ambiguous_evaluation_data = list(filtered_rows.itertuples(index=False, name=None))\n", "\n", "ambiguous_evaluation_data[:1]" ] }, { "cell_type": "markdown", "source": [ "### Full Benchmark Data" ], "metadata": { "id": "MWMaXzOmlcPo" } }, { "cell_type": "code", "source": [ "benchmark = []\n", "\n", "for g, p in mana_evaluation_data:\n", " benchmark.append((g, p, '', ''))\n", "\n", "for g, p in commonvoice_evaluation_data:\n", " benchmark.append((g, p, '', ''))\n", "\n", "for g, p, w, r in ambiguous_evaluation_data:\n", " benchmark.append((g, p, w, r))" ], "metadata": { "id": "naGAAnA-ldLl" }, "execution_count": 39, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Inference" ], "metadata": { "id": "CGqrcAVA8QEI" } }, { "cell_type": "code", "source": [ "!mkdir llama3-70b-8192" ], "metadata": { "id": "nnlritmp8QEJ" }, "execution_count": 40, "outputs": [] }, { "cell_type": "code", "source": [ "with open('llama3-70b-8192/1.txt', 'w') as f:\n", " for g, p, w, r in tqdm(benchmark):\n", " output = prompt1(g)\n", " output = output.replace('\\n', '')\n", " print(output)\n", "\n", " f.write(f\"{output}\\n\")" ], "metadata": { "id": "GTE3REZpiBbq" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "R6PE5ds45TPr" }, "source": [ "# Define Evaluation Metrics" ] }, { "cell_type": "markdown", "metadata": { "id": "y73zFlRGIbt9" }, "source": [ "## PER Evaluation" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "id": "ItuviO3w5Vzv" }, "outputs": [], "source": [ "def remove_non_word_chars(text):\n", " pattern = r'[^\\w\\s\\?]'\n", " cleaned_text = re.sub(pattern, ' ', text)\n", " return cleaned_text" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "id": "syQCurXu51TO" }, "outputs": [], "source": [ "def remove_white_spaces(text):\n", " cleaned_text = re.sub(r'\\s+', ' ', text)\n", " return cleaned_text.strip()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "id": "V7APkVM053RP" }, "outputs": [], "source": [ "def get_word_only_text(text):\n", " word_only_text = remove_non_word_chars(text)\n", " extra_space_removed_text = remove_white_spaces(word_only_text)\n", "\n", " return extra_space_removed_text" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "id": "ROomKSao57vy" }, "outputs": [], "source": [ "def get_texts_cer(reference, model_output):\n", " # Preprocess input texts to only contain word characters\n", " word_only_reference = get_word_only_text(reference)\n", " word_only_output = get_word_only_text(model_output)\n", "\n", " # Return +infinity for CER if any of the texts is empty\n", " if not word_only_reference.strip() or not word_only_output.strip():\n", " return float('inf')\n", "\n", " return cer(word_only_reference, word_only_output)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "id": "4vHLUjp48hc3" }, "outputs": [], "source": [ "def get_avg_cer_of_method(method_outputs, references):\n", " cers = []\n", " for idx, o in enumerate(method_outputs):\n", " cer = get_texts_cer(o, references[idx][1])\n", " if cer != float('inf'):\n", " cers.append(cer)\n", "\n", " return sum(cers) / len(cers)" ] }, { "cell_type": "markdown", "source": [ "## Ezafe Evaluation" ], "metadata": { "id": "f4NqCjr1FxVg" } }, { "cell_type": "code", "source": [ "def get_EZ_words_from_ground_truth(text):\n", " pattern = r'\\b(\\w+)(-e|-ye)\\b'\n", " matches = re.findall(pattern, text,)\n", "\n", " # Extract the words along with the suffix\n", " words_with_suffix = [match[0] + match[1] for match in matches]\n", " EZ_words = [tuple(re.split(r'(?=-)', w)) for w in words_with_suffix]\n", "\n", " return EZ_words" ], "metadata": { "id": "Fn1IjihOEDEF" }, "execution_count": 48, "outputs": [] }, { "cell_type": "code", "execution_count": 49, "metadata": { "id": "xiEKlZjV2OMC" }, "outputs": [], "source": [ "def get_EZ_words_from_phonetic_model_output(text):\n", " EZ_words = re.findall(r'\\b(\\w+)(-e|-ye)', text)\n", " EZ_word_candidates = []\n", "\n", " other_words = re.findall(r'\\b(\\w+)(?=(?:[^-\\w]|$))', text)\n", " for word in other_words:\n", " if len(word) >= 4 and word[-3] in 'еeiuoaāäâāɒáA' and word.endswith('ye') and word_in_dict(word[:-2], inverted_kaamel_dict) and not word_in_dict(word, inverted_kaamel_dict) and not word_in_dict(word[:-1], inverted_kaamel_dict):\n", " EZ_words.append((word[:-2], '-ye'))\n", " continue\n", "\n", " if len(word) >= 3 and word.endswith('e') and word_in_dict(word[:-1], inverted_kaamel_dict) and not word_in_dict(word, inverted_kaamel_dict):\n", " EZ_words.append((word[:-1], '-e'))\n", " continue\n", "\n", " if len(word) >= 4 and word[-3] in 'еeiuoaāäâāɒáA' and word.endswith('ye'):\n", " EZ_word_candidates.append((word[:-2], '-ye'))\n", " continue\n", "\n", " if len(word) >= 3 and word.endswith('e'):\n", " EZ_word_candidates.append((word[:-1], '-e'))\n", "\n", " return EZ_words, EZ_word_candidates" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "id": "PFkBeD262OMD" }, "outputs": [], "source": [ "def get_ezafe_TP_FP_TN_FN(gt_finglish, model_finglish):\n", " gt_word_count = len(re.findall(r'\\b\\w+(?:-\\w+)*\\b', gt_finglish))\n", " gt_EZ_words = get_EZ_words_from_ground_truth(gt_finglish)\n", "\n", " model_EZ_words, model_candidate_EZ_words = get_EZ_words_from_phonetic_model_output(model_finglish)\n", "\n", " TP = 0\n", " FP = 0\n", " TN = 0\n", " FN = 0\n", "\n", " gt_matched_indices = set()\n", " model_matched_indices = set()\n", " model_candidate_matched_indices = set()\n", "\n", " for gt_idx, (word, EZ) in enumerate(gt_EZ_words):\n", " for model_idx, (w, E) in enumerate(model_EZ_words):\n", " if model_idx not in model_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:\n", " TP += 1\n", " gt_matched_indices.add(gt_idx)\n", " model_matched_indices.add(model_idx)\n", " break\n", " else:\n", " for model_c_idx, (w, E) in enumerate(model_candidate_EZ_words):\n", " if model_c_idx not in model_candidate_matched_indices and SequenceMatcher(None, word, w).ratio() > 0.65:\n", " TP += 1\n", " gt_matched_indices.add(gt_idx)\n", " model_candidate_matched_indices.add(model_c_idx)\n", " break\n", "\n", " # Calculate FP: model_EZ_words that are not TP\n", " FP = len(model_EZ_words) - (TP - len(list(model_candidate_matched_indices)))\n", "\n", " # Calculate FN: gt_EZ_words that were not detected\n", " FN = len(gt_EZ_words) - TP\n", "\n", " # Calculate TN: non-Ezafe words that are correctly not detected as Ezafe\n", " TN = (gt_word_count - len(gt_EZ_words)) - FP\n", "\n", " return TP, FP, TN, FN\n" ] }, { "cell_type": "code", "source": [ "def get_ezafe_performance(outputs, references):\n", " total_TP, total_FP, total_TN, total_FN = 0, 0, 0, 0\n", "\n", " for idx, o in enumerate(outputs):\n", " TP, FP, TN, FN = get_ezafe_TP_FP_TN_FN(references[idx][1], o)\n", " total_TP += TP\n", " total_FP += FP\n", " total_TN += TN\n", " total_FN += FN\n", "\n", "\n", " total_model_EZ = total_TP + total_FP\n", " total_gt_EZ = total_TP + total_FN\n", "\n", " total_model_T = total_TP + total_TN\n", "\n", " total_gt_words = total_TP + total_TN + total_FP + total_FN\n", "\n", " accuracy = (total_model_T) / (total_gt_words) * 100\n", " precision = (total_TP) / (total_model_EZ) * 100\n", " recall = (total_TP) / (total_gt_EZ) * 100\n", "\n", " return accuracy, precision, recall" ], "metadata": { "id": "cbW4otNyIQLh" }, "execution_count": 51, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Homograph Evaluation" ], "metadata": { "id": "oBgNtpFQDwku" } }, { "cell_type": "code", "source": [ "def get_homograph_performance(outputs, references):\n", " corrects = 0\n", " total = 0\n", "\n", " for idx, (g, p, homograph, right) in enumerate(references):\n", " if homograph != '':\n", " total += 1\n", " if right in outputs[idx]:\n", " corrects += 1\n", "\n", " return corrects / total" ], "metadata": { "id": "J445ULEvEEDn" }, "execution_count": 52, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Evaluate Outputs" ], "metadata": { "id": "JGEUIrbi9kNH" } }, { "cell_type": "code", "source": [ "base_path = 'llama3-70b-8192'" ], "metadata": { "id": "5ySnzd_C98yz" }, "execution_count": 53, "outputs": [] }, { "cell_type": "code", "source": [ "def get_method_outputs(method_name):\n", " predictions = []\n", " with open(base_path + f'/{method_name}.txt', 'r') as f:\n", " predictions = [line for line in f.read().splitlines() if line.strip()]\n", " return predictions\n" ], "metadata": { "id": "wAKo1omGB8wc" }, "execution_count": 54, "outputs": [] }, { "cell_type": "code", "source": [ "def print_all_metrics(predictions):\n", " per = get_avg_cer_of_method(predictions, benchmark) * 100\n", " acc, prec, recall = get_ezafe_performance(predictions, benchmark)\n", " homograph = get_homograph_performance(predictions, benchmark) * 100\n", "\n", " print(f\"PER: \\t\\t\\t{per:.2f}\")\n", " print(f\"ACC, PREC, RECALL, F1: \\t{acc:.2f}, {prec:.2f}, {recall:.2f}, {((2 * prec * recall) / (prec + recall)):.2f}\")\n", " print(f\"Homograph: \\t\\t{homograph:.2f}\")\n" ], "metadata": { "id": "4jlXFt8tCPWB" }, "execution_count": 55, "outputs": [] }, { "cell_type": "code", "source": [ "# Evaluate prompt 1\n", "print_all_metrics(get_method_outputs('1'))" ], "metadata": { "id": "ksd3ybrUCH5g" }, "execution_count": null, "outputs": [] } ], "metadata": { "colab": { "collapsed_sections": [ "Zg2EzX4hOReJ", "AdU8VMTIOWLZ", "7-_ozqCtlEPi", "2zXtp8dxuDOV", "CRKJ6U6Hxaz3", "wQTmDqqFLBdR", "r-CZbQIkFQfS", "6UpTo6L_1bD8", "G2GAkjbQ61mX", "XjAPkfq7SF87", "wDV7ysXf2b_H", "Jjacw9Mp2eoX", "ciSPyhRc3Rvo", "y73zFlRGIbt9", "f4NqCjr1FxVg", "oBgNtpFQDwku" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }