{ "cells": [ { "cell_type": "markdown", "source": [ "# Setup Environment" ], "metadata": { "id": "ABgLYF9R8viP" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "I3VDTSYocPBc", "outputId": "71f5f740-0871-4bbf-9579-f7dacf9a33ef" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting hazm\n", " Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)\n", "Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)\n", " Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)\n", "Collecting flashtext<3.0,>=2.7 (from hazm)\n", " Downloading flashtext-2.7.tar.gz (14 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (4.3.3)\n", "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (3.9.1)\n", "Collecting numpy==1.24.3 (from hazm)\n", " Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", "Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)\n", " Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", "Requirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.10/dist-packages (from hazm) (1.6.0)\n", "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)\n", " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm) (75.1.0)\n", "Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (1.13.1)\n", "Requirement already satisfied: smart-open>=1.8.1 in 
/usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (7.1.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (8.1.8)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (1.4.2)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (2024.11.6)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (4.67.1)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm) (3.5.0)\n", "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open>=1.8.1->gensim<5.0.0,>=4.3.1->hazm) (1.17.0)\n", "Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m59.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading 
pybind11-2.13.6-py3-none-any.whl (243 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: flashtext\n", " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9298 sha256=00e71c3668bd8e587a801eba10ad77cd1062a93b11134d8064513370e729d01f\n", " Stored in directory: /root/.cache/pip/wheels/bc/be/39/c37ad168eb2ff644c9685f52554440372129450f0b8ed203dd\n", "Successfully built flashtext\n", "Installing collected packages: flashtext, python-crfsuite, pybind11, numpy, fasttext-wheel, hazm\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 1.26.4\n", " Uninstalling numpy-1.26.4:\n", " Successfully uninstalled numpy-1.26.4\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "albucore 0.0.19 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "albumentations 1.4.20 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.\n", "pymc 5.19.1 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 hazm-0.10.0 numpy-1.24.3 pybind11-2.13.6 python-crfsuite-0.9.11\n" ] }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "numpy" ] }, "id": "3a509826026a4a01ac258472d31eeec1" } }, "metadata": {} } ], "source": [ "!pip install hazm # Requires restart." 
] }, { "cell_type": "code", "source": [ "!pip install groq\n", "!pip install jiwer" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mjTNnSAomVHS", "outputId": "e9737eea-df8e-474c-9725-b9923ba9f635" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting groq\n", " Downloading groq-0.14.0-py3-none-any.whl.metadata (14 kB)\n", "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from groq) (3.7.1)\n", "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from groq) (1.9.0)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from groq) (0.28.1)\n", "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from groq) (2.10.4)\n", "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from groq) (1.3.1)\n", "Requirement already satisfied: typing-extensions<5,>=4.10 in /usr/local/lib/python3.10/dist-packages (from groq) (4.12.2)\n", "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->groq) (3.10)\n", "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->groq) (1.2.2)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->groq) (2024.12.14)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->groq) (1.0.7)\n", "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->groq) (0.14.0)\n", "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->groq) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.27.2 in 
def get_response(messages):
    """Query the Groq chat-completion API and return the reply text.

    Retries indefinitely on API errors (rate limits, transient network
    failures), sleeping briefly between attempts so the retry loop does not
    spin hot.

    Args:
        messages: list of {"role": ..., "content": ...} dicts in the
            OpenAI/Groq chat format.

    Returns:
        The content string of the first completion choice.
    """
    import os
    import time

    # Read the key from the environment instead of hardcoding a credential
    # in the notebook (the original had an inline '' with "Insert API key").
    client = Groq(
        api_key=os.environ.get('GROQ_API_KEY', ''),
    )

    while True:
        try:
            response = client.chat.completions.create(
                model='llama3-70b-8192',
                messages=messages,
            )
            return response.choices[0].message.content

        except Exception as e:
            # Log and retry; brief back-off so rate-limit errors don't busy-loop.
            print(e)
            time.sleep(1)
" return response\n", "\n", " except Exception as e:\n", " print(e)\n", " continue\n" ] }, { "cell_type": "markdown", "metadata": { "id": "AdU8VMTIOWLZ" }, "source": [ "# Get Dictionary" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xm_St_kWOYI-", "outputId": "6d973e40-faa8-4cf5-b52b-7a6af76aabe0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-01-09 20:32:04-- https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv\n", "Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...\n", "Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 7945406 (7.6M) [text/plain]\n", "Saving to: ‘KaamelDict.csv’\n", "\n", "KaamelDict.csv 100%[===================>] 7.58M 11.5MB/s in 0.7s \n", "\n", "2025-01-09 20:32:05 (11.5 MB/s) - ‘KaamelDict.csv’ saved [7945406/7945406]\n", "\n" ] } ], "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/KaamelDict/raw/main/KaamelDict.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "dGYh5bDyRfTg" }, "outputs": [], "source": [ "dict_path = \"KaamelDict.csv\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "WV2x_iLQRhHI" }, "outputs": [], "source": [ "dict_df = pd.read_csv(dict_path)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "OJTyOEoMR-cV" }, "outputs": [], "source": [ "kaamel_dict = {}\n", "\n", "for idx, row in dict_df.iterrows():\n", " g, p = row['grapheme'], ''.join(eval(row['phoneme']))\n", " if g not in kaamel_dict:\n", " kaamel_dict[g] = []\n", " kaamel_dict[g].append(p)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "sIUx62uakGV5" }, "outputs": [], "source": [ "phoneme_to_finglish_map = {\n", " 'A': 'aa',\n", " 'S': 'Sh',\n", " 'Z': 'Zh',\n", " 'q': 'Gh',\n", " 'x': 'Kh',\n", " 
def replace_phonetic_characters(input_string, char_map):
    """Rewrite single characters of ``input_string`` according to ``char_map``.

    ``char_map`` maps one-character keys to replacement strings (possibly
    multi-character, e.g. 'A' -> 'aa'); characters absent from the map are
    left untouched.
    """
    return input_string.translate(str.maketrans(char_map))
def replace_LLM_characters(input_string, char_map):
    """Normalize an LLM's output into Finglish spelling.

    First collapses multi-character IPA sequences (order matters: 'tʃʰ'
    must be handled before 'tʃ'), then applies the single-character
    ``char_map`` via a translation table.
    """
    multi_char_rules = (
        ('tʃʰ', 'ch'),
        ('tʃ', 'ch'),
        ('t͡S', 'ch'),
        ('kʰ', 'k'),
        ('pʰ', 'p'),
        ('tʰ', 't'),
        ('ow', 'o'),
        ('dʒ', 'j'),
    )

    text = input_string
    for pattern, replacement in multi_char_rules:
        text = re.sub(pattern, replacement, text)

    return text.translate(str.maketrans(char_map))
def get_finglish_consonants(word):
    """Map a Persian word to its bare consonant skeleton in Finglish.

    Vowel carriers ('ا', 'آ') are dropped and every other Persian letter is
    replaced by its Latin consonant; characters outside the table pass
    through unchanged.
    """
    char_map = {
        'ا': '', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',
        'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',
        'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "'",
        'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
        'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': "'", 'ئ': "'", 'ؤ': "'",
        'آ': '', 'أ': "'", 'إ': "'", 'ۀ': 'y'
    }
    return word.translate(str.maketrans(char_map))


def get_updated_span(match_span, displacements):
    """Shift a (start, end) span by previously recorded text displacements.

    ``displacements`` is a list of (position, delta) pairs collected while
    splicing replacement strings into a text; every displacement whose
    position falls at or before the running start moves the whole span by
    its delta, yielding the span's location in the already-edited text.
    """
    start, end = match_span
    for position, delta in displacements:
        if position <= start:
            start += delta
            end += delta
    return (start, end)
'v',\n", " 'c': 'k',\n", " 'ĉ': 'C',\n", " 'č': 'C',\n", " '̕': \"?\",\n", " \"'\": '?',\n", " 'ʔ': \"?\",\n", " 'ꞌ': \"?\",\n", " '̛': \"?\",\n", " '’': \"?\",\n", " 'ʼ': \"?\",\n", " \"'\": '?',\n", " 'â': 'A',\n", " 'â': 'A',\n", " 'ȃ': 'A',\n", " 'ž': 'Z',\n", " 'š': 'S',\n", " 'W': 'v',\n", " 'β': 'f',\n", " 'е': 'e',\n", " '`': \"?\",\n", " 'ɑ': 'A',\n", " 'ɑ': 'A',\n", " 'ʃ': 'S',\n", " 'ð': 'z',\n", " 'ɾ': 'r',\n", " 'æ': 'a',\n", " 'ɪ': 'e',\n", " 'χ': 'x',\n", " 'ɣ': 'q',\n", " 'ʒ': 'Z',\n", " ':': '',\n", " 'ː': '',\n", " 'ā': 'A',\n", " 'ː': '',\n", " 'ä': 'A',\n", " 'á': 'A',\n", " 'š': 'S',\n", " 'ū': 'u',\n", " 'û': 'u',\n", " 'ś': 's',\n", " 'ī': 'i',\n", " 'í': 'i',\n", " 'î': 'i',\n", " 'é': 'e',\n", " 'ḥ': 'h',\n", " 'ɒ': 'A',\n", " 'ʰ': '',\n", " 'ə': 'e',\n", " 'R': 'r',\n", " 'W': 'v',\n", " 'Q': 'q',\n", " 'T': 't',\n", " 'Y': 'y',\n", " 'P': 'p',\n", " 'D': 'd',\n", " 'F': 'f',\n", " 'H': 'h',\n", " 'J': 'j',\n", " 'L': 'l',\n", " 'X': 'x',\n", " 'V': 'v',\n", " 'B': 'b',\n", " 'N': 'n',\n", " 'M': 'm',\n", " 'K': 'k',\n", " 'G': 'g',\n", " 'U': 'u',\n", " 'O': 'o',\n", " 'I': 'i',\n", " 'E': 'e',\n", " 'ا': 'A',\n", " 'ب': 'b',\n", " 'پ': 'p',\n", " 'ت': 't',\n", " 'ث': 's',\n", " 'ج': 'j',\n", " 'چ': 'C',\n", " 'ح': 'h',\n", " 'خ': 'x',\n", " 'د': 'd',\n", " 'ذ': 'z',\n", " 'ر': 'r',\n", " 'ز': 'z',\n", " 'ژ': 'Z',\n", " 'س': 's',\n", " 'ش': 'S',\n", " 'ص': 's',\n", " 'ض': 'z',\n", " 'ط': 't',\n", " 'ظ': 'z',\n", " 'ع': \"?\",\n", " 'غ': 'q',\n", " 'ف': 'f',\n", " 'ق': 'q',\n", " 'ک': 'k',\n", " 'گ': 'g',\n", " 'ل': 'l',\n", " 'م': 'm',\n", " 'ن': 'n',\n", " 'و': 'v',\n", " 'ه': 'h',\n", " 'ی': 'y',\n", " 'ء': \"?\",\n", " 'ئ': \"?\",\n", " 'ؤ': \"o?\",\n", " 'آ': 'A',\n", " 'أ': \"a?\",\n", " 'إ': \"e?\",\n", " 'ۀ': 'eye',\n", " 'ŋ': 'ng',\n", " '.': '',\n", " 'ɛ': 'e',\n", " 'ʊ': 'u',\n", " \"ˈ\": '?',\n", " 'ù': 'u',\n", " 'θ': 's',\n", " '̪': '',\n", " 'ũ': 'u',\n", " '_': '',\n", " 'ç': 'C',\n", " 'ĝ': 'q',\n", " 'ɢ': 'q',\n", " 'ː': 
# Lookahead alternations used to detect digraphs like 'sh' that are followed
# by another consonant (and therefore are a real digraph, not s + h).
consonants_regex = '(?=' + '|'.join(['q', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'Q', 'R', 'T', 'Y', 'P', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M']) + ')'
vowels_regex = '(?=' + '|'.join(['a', 'A', 'e', 'i', 'u', 'o']) + ')'


def replace_LLM_phonetic_characters(input_string, char_map, from_phonetics=False):
    """Normalize an LLM's Finglish/IPA output into the internal phonetic alphabet.

    The internal alphabet uses single symbols: S (ش), C (چ), Z (ژ), q (غ/ق),
    x (خ), A (long a), ? (glottal stop).

    Args:
        input_string: raw model output.
        char_map: single-character translation table (e.g. output_to_phonetics_map).
        from_phonetics: if True, skip the Finglish digraph pass (input is
            already IPA-like).

    Returns:
        The normalized string, stripped of brackets, 'output=' scaffolding and
        any character outside [a-zA-Z?\\s].
    """
    if not from_phonetics:
        try:
            # Capitalized Finglish digraphs are unambiguous: collapse first.
            for digraph, phone in (('Sh', 'S'), ('Ch', 'C'), ('Zh', 'Z'), ('Gh', 'q'), ('Kh', 'x')):
                input_string = re.sub(digraph, phone, input_string)
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); on failure the original string
            # is logged and processing continues unchanged, as before.
            print(input_string)

    # IPA affricates and multi-character sequences.
    substituted = re.sub(r'ch', 'C', input_string)
    substituted = re.sub(r'tʃʰ', 'C', substituted)
    substituted = re.sub(r'tʃ', 'C', substituted)
    substituted = re.sub(r't͡S', 'C', substituted)
    substituted = re.sub(r'ow', 'o', substituted)
    substituted = re.sub('dʒ', 'j', substituted)

    # Single-character normalization via translation table.
    substituted = substituted.translate(str.maketrans(char_map))

    # Collapse doubled-vowel spellings into single internal vowels.
    for doubled, single in (('ee', 'i'), ('ii', 'i'), ('oo', 'u'), ('uu', 'u'),
                            ('aa', 'A'), ('AA', 'A'), ('Aa', 'A'), ('aA', 'A')):
        substituted = re.sub(doubled, single, substituted)

    # Lower-case digraphs only count at a word boundary or before a consonant
    # (otherwise 's' + 'h' may genuinely be two phonemes).
    substituted = re.sub(rf'(?:\b(sh)|(sh){consonants_regex}|(sh)\b)', 'S', substituted)
    substituted = re.sub(rf'(?:\b(kh)|(kh){consonants_regex}|(kh)\b)', 'x', substituted)
    substituted = re.sub(rf'(?:\b(zh)|(zh){consonants_regex}|(zh)\b)', 'Z', substituted)
    substituted = re.sub(rf'(?:\b(gh)|(gh){consonants_regex}|(gh)\b)', 'q', substituted)

    # A word-initial vowel implies a glottal stop in Persian: prefix '?'.
    substituted = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', substituted)

    # Strip the 'output=[...]' scaffolding the prompts ask the LLM to emit.
    for marker in ('?output=[', '[?output=', 'output=[', '[output=',
                   'output=', 'output', '[', ']', '='):
        substituted = substituted.replace(marker, '')

    # Drop anything left outside the internal alphabet.
    substituted = re.sub(r'[^a-zA-Z\?\s]', '', substituted)

    return substituted
def fix_ambiguities(model_text, gt_text):
    """Resolve Latin digraph ambiguities in `model_text` using the Persian text.

    Finglish digraphs are ambiguous: 'sh' may be ش (one phoneme S) or س+ه
    (two phonemes). This walks the Persian ground-truth characters in order
    while advancing a cursor `i` through `model_text`; wherever the Persian
    text has ش/ژ/خ/غ/ق but the Latin text spelled the sound as a two-letter
    digraph, the digraph is collapsed to the single internal symbol
    ('S'/'Z'/'x'/'q'). Each of the four sections below restarts the cursor
    and handles one digraph family.

    Args:
        model_text: model output in Latin characters (modified and returned).
        gt_text: the original Persian sentence used as the alignment reference.

    Returns:
        `model_text` with aligned digraphs collapsed.
    """
    # fix sh
    i = 0
    for c in gt_text:
        if c in 'سصث':
            # Plain 's' letters: just advance the cursor past the next s/S.
            while i < len(model_text) - 1:
                i += 1
                if model_text[i] in 'sS': break

        if c == 'ش':
            # Look for the matching S; if it was written as 'Sh'/'sh',
            # splice in the single internal symbol 'S'.
            while i < len(model_text) - 2:
                i += 1
                if model_text[i] == 'S' and model_text[i + 1] != 'h': break
                if model_text[i] in 'Ss' and model_text[i + 1] == 'h':
                    model_text = model_text[:i] + 'S' + model_text[i + 2:]
                    break

        # Cursor exhausted: nothing further can be aligned.
        if i >= len(model_text) - 1: break

    # fix zh
    i = 0
    for c in gt_text:
        if c in 'زذضظ':
            # Plain 'z' letters: advance past the next z/Z.
            while i < len(model_text) - 1:
                i += 1
                if model_text[i] in 'zZ': break

        if c == 'ژ':
            while i < len(model_text) - 2:
                i += 1
                if model_text[i] == 'Z' and model_text[i + 1] != 'h': break
                if model_text[i] in 'zZ' and model_text[i + 1] == 'h':
                    model_text = model_text[:i] + 'Z' + model_text[i + 2:]
                    break

        if i >= len(model_text) - 1: break


    # fix kh
    i = 0
    for c in gt_text:
        if c == 'ک':
            # Plain 'k': advance past the next k/K.
            while i < len(model_text) - 1:
                i += 1
                if model_text[i] in 'kK': break

        if c == 'خ':
            while i < len(model_text) - 2:
                i += 1
                # Already the internal symbol: nothing to rewrite.
                if model_text[i] in 'xX': break
                if model_text[i] in 'kK' and model_text[i + 1] == 'h':
                    model_text = model_text[:i] + 'x' + model_text[i + 2:]
                    break

        if i >= len(model_text) - 1: break


    # fix gh
    i = 0
    for c in gt_text:
        if c == 'گ':
            # Plain 'g': advance past the next g/G.
            while i < len(model_text) - 1:
                i += 1
                if model_text[i] in 'Gg': break

        if c in 'غق':
            while i < len(model_text) - 2:
                i += 1
                # Already the internal symbol 'q': nothing to rewrite.
                if model_text[i] == 'q': break
                if model_text[i] in 'Gg' and model_text[i + 1] == 'h':
                    model_text = model_text[:i] + 'q' + model_text[i + 2:]
                    break

        if i >= len(model_text) - 1: break

    return model_text
def substitute_by_dict(model_text, gt_text):
    """Replace recognizable words in `model_text` with dictionary phonemes.

    Two passes over the words of `model_text`:
      1. A word whose Finglish form maps back (via the inverted dictionary)
         to a Persian grapheme present in `gt_text` is replaced with the
         most SequenceMatcher-similar pronunciation from `kaamel_dict`.
      2. Remaining words containing an ambiguous digraph (sh/kh/zh, any
         case) are compared by consonant skeleton with words of `gt_text`;
         close matches (> 0.65) are repaired with fix_ambiguities.
    The queued replacements are finally spliced in, with get_updated_span
    compensating for earlier length changes.

    Args:
        model_text: LLM output in Latin characters.
        gt_text: the Persian source sentence (consumed destructively: each
            matched grapheme is removed so it cannot match twice).

    Returns:
        `model_text` with substitutions applied.
    """
    subwords = []          # queued (span, replacement) pairs, applied at the end
    matched_spans = set()  # spans already handled by the dictionary pass
    # Word = run of ?, word chars, or apostrophes, up to a non-word char or EOS.
    for match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", model_text):
        match_text = match.group()
        match_span = match.span()

        finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
        if finglish_text in inverted_finglish_kaamel_dict and inverted_finglish_kaamel_dict[finglish_text] in gt_text:
            # Pick the dictionary pronunciation closest to the model's word.
            max_sim, max_sim_p = -1, ''
            for p in kaamel_dict[inverted_finglish_kaamel_dict[finglish_text]]:
                # NOTE(review): phonetic_text does not depend on p, so this
                # conversion could be hoisted out of the loop — confirm before
                # changing, behavior is identical either way.
                phonetic_text = replace_LLM_phonetic_characters(finglish_text, output_to_phonetics_map)
                sim = SequenceMatcher(None, phonetic_text, p).ratio()
                if sim > max_sim:
                    max_sim = sim
                    max_sim_p = p

            # Consume the grapheme so it is not matched a second time.
            gt_text = gt_text.replace(inverted_finglish_kaamel_dict[finglish_text], '')
            subwords.append((match_span, max_sim_p))
            matched_spans.add(match_span)

    # Second pass: digraph-bearing words not already substituted.
    for match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", model_text):
        match_text = match.group()
        match_span = match.span()

        if match_span in matched_spans: continue
        # Only words containing an ambiguous digraph are worth aligning.
        if not 'sh' in match_text and not 'kh' in match_text and not 'zh' in match_text and not 'Sh' in match_text and not 'Kh' in match_text and not 'Zh' in match_text: continue

        finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
        # Strip vowels (incl. Cyrillic 'е' and accented variants) to compare
        # consonant skeletons only.
        consonant_finglish = re.sub(r'(е|e|i|u|o|a|ā|ä|â|ā|ɒ|á|A)', '', finglish_text)

        for gt_match in re.finditer(r"(\?|\w|')+(?=[^\?\w']|$)", gt_text):
            gt_match_text = gt_match.group()
            gt_match_span = gt_match.span()

            gt_consonant_finglish = get_finglish_consonants(gt_match_text)
            # NOTE(review): no break after a hit — several gt words above the
            # threshold each queue a replacement for the SAME span, and the
            # later splices compound. Looks unintended; confirm.
            if SequenceMatcher(None, consonant_finglish, gt_consonant_finglish).ratio() > 0.65:
                subwords.append((match_span, fix_ambiguities(match_text, gt_match_text)))

    # Apply queued replacements; displacements track how earlier splices
    # shifted later spans.
    displacements = []
    for span, replacement in subwords:
        updates_span = get_updated_span(span, displacements)
        model_text = model_text[:updates_span[0]] + replacement + model_text[updates_span[1]:]
        displacements.append((updates_span[0], len(replacement) - (updates_span[1] - updates_span[0])))

    return model_text
def get_known_words(graphemes, multiple_choices=True, dictionary=finglish_kaamel_dict):
    """Build a newline-separated listing of dictionary hits for a sentence.

    Splits `graphemes` (a Persian sentence) on non-word characters and, for
    every token found in `dictionary`, emits a line 'word: pron1, pron2, ...'.
    With multiple_choices=False only unambiguous entries (at most one
    pronunciation) are listed.
    """
    tokens = re.split(r'\W+', graphemes)

    lines = []
    for token in tokens:
        if token not in dictionary:
            continue
        pronunciations = dictionary[token]
        if multiple_choices or len(pronunciations) <= 1:
            lines.append(f'{token}: {", ".join(pronunciations)}')

    return '\n'.join(lines)
def substitute_output_by_dict(grapheme, output, dictionary=finglish_kaamel_dict):
    """Substitute output words with their closest dictionary pronunciation.

    For every word of the Persian `grapheme` sentence found in `dictionary`,
    the most similar word of `output` (SequenceMatcher ratio >= 0.65, best
    matches first, one substitution per grapheme) is replaced with the
    dictionary phoneme string, re-attaching any Ezafe suffix ('-e' / '-ye').

    Args:
        grapheme: the Persian source sentence.
        output: the model's Finglish/phonetic output.
        dictionary: grapheme -> list-of-pronunciations mapping.

    Returns:
        `output` with dictionary substitutions applied.
    """
    ACCEPTED_THRESHOLD = 0.65
    # Glue a stray '-i' back onto a preceding consonant: it is part of the
    # word, not an Ezafe connector. The character class lists vowels,
    # including Cyrillic 'е' and accented variants LLMs sometimes emit.
    output = re.sub(r'([^еeiuoaāäâāɒáA])(-i)', r'\1i', output)

    alternatives = get_known_words_list(grapheme, dictionary=dictionary)
    output_words = re.split('[^-\w\?]+', output)

    # Renamed from `grapheme` to avoid shadowing the function parameter.
    remaining_graphemes = [known_grapheme for known_grapheme, _ in alternatives]

    # Score every (dictionary pronunciation, output word) pair.
    pairs = []
    for known_grapheme, phonemes in alternatives:
        for phoneme in phonemes:
            for word in output_words:
                pairs.append((SequenceMatcher(None, phoneme, word).ratio(), phoneme, word, known_grapheme))

    # Greedy: best-scoring pairs first, at most one substitution per grapheme.
    for score, phoneme, output_word, known_grapheme in sorted(pairs, key=lambda x: x[0], reverse=True):
        if score < ACCEPTED_THRESHOLD: break
        if known_grapheme not in remaining_graphemes: continue
        remaining_graphemes.remove(known_grapheme)

        if output_word.endswith('-e'):
            output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

        elif output_word.endswith('-ye'):
            output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

        # BUG FIX: the two branches below previously sliced the stale loop
        # variable `word` (leaked from the scoring loop above) instead of
        # `output_word`, so the un-hyphenated-Ezafe check compared against
        # the wrong string (and raised NameError when no pairs were scored).
        elif phoneme[-1] in 'еeiuoaāäâāɒáA' and output_word.endswith('ye') and SequenceMatcher(None, phoneme, output_word[:-2]).ratio() > score:
            output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

        elif phoneme[-1] not in 'еeiuoaāäâāɒáA' and output_word.endswith('e') and SequenceMatcher(None, phoneme, output_word[:-1]).ratio() > score:
            output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

        elif score > ACCEPTED_THRESHOLD:
            output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme}\3", output)

    return output
representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"ɾæŋ-e ?ɒːbiː\", \"زندگی شیرین\": \"zendegiː-je ʃiːɾiːn\").\n", " 3. Use \"ʃ\" for 'ش', \"tʃʰ\" for 'چ', \"x\" for 'خ', \"q\" for 'ق'; \"ɣ\" for 'غ', \"ʒ\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: æ (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: ɒː (آ/ا), iː (ای), uː (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 'tʰ' for ت and ط; 'j' for ی; 'pʰ' for پ; 'kʰ' for ک; 'ɾ' for ر; 'ŋ' for نگ\n", " - Omit silent 'h' at the end of words (e.g., خانه → xɒːne, not xɒːneh)\n", " - Represent ع, ئ , and ء with an ʔ when it's pronounced\n", "\n", " Here are a few examples:\n", " input=[جریان شال چی بود؟], output=[dʒæɾjɒːn-e ʃɒːl tʃʰiː buːd]\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e now dæɾ ɣɒːɾ hæst jɒː xɒːne]\n", " input=[ژن زیبارویان پولدار], output=[ʒen-e ziːbɒːruːjɒːn-e puːldɒːɾ]\n", " input=[اتفاقی نمیافتد], output=[?ettefɒːqiː nemiː-?oftɒːd]\n", " input=[گرگ حیوانی وحشی است], output=[goɾg hejvɒːniː væhʃiː ?æst]\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the phonemes of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " return output\n" ], "metadata": { "id": "tIzIEZ8tuDOX" }, "execution_count": 25, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 3: Finglish" ], "metadata": { "id": "CRKJ6U6Hxaz3" } }, { "cell_type": "code", "source": [ "def prompt3(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": 
'''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "MJ-LmleDxaz5" }, "execution_count": 26, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 4: Rule-based Correction" ], "metadata": { "id": "wQTmDqqFLBdR" } }, { "cell_type": "code", "source": [ "def prompt4(grapheme):\n", " matches = None\n", 
"\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = substitute_output_by_dict(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output\n" ], "metadata": { "id": "hPD_d6rZLBdf" }, 
"execution_count": 27, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 5: LLM-based Correction" ], "metadata": { "id": "vl1JMrgkL8qc" } }, { "cell_type": "code", "source": [ "def prompt5(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " ## Examples\n", " input=[جریان شما چی بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل نو در غار هست یا خانه؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد], output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است], output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{grapheme}].\\n Return the Finglish of it in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = 
correct_output_by_llm_and_dict_info_finglish(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "BsSy3b1SL8qm" }, "execution_count": 28, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 6: Dict Hints (1)" ], "metadata": { "id": "r-CZbQIkFQfS" } }, { "cell_type": "code", "source": [ "def prompt6(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, the Finglish of some of the words is given. Use the right pronunciations as help.\n", "\n", " ## Examples\n", " input=[جشن چهل مرد]. dict entries: 'مرد': mord, mard, 'جشن': jaShn, output=[jaShn-e Chehel mard].\n", " input=[گل نو در غار هست یا خانه؟]. dict entries: 'گل': gol, gel, 'در': dar, dorr, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد]. dict entries: . 
output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f'''This is the Persian sentence: [{grapheme}].\n", " These are the pronunciations of some of the words I know:\n", " {get_known_words(grapheme, multiple_choices=True)}.\n", " Return Finglish of the Persian sentence in brackets like output=[].'''\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "IAu4lBBDFQff" }, "execution_count": 29, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 7: Dict Hints (2)" ], "metadata": { "id": "6UpTo6L_1bD8" } }, { "cell_type": "code", "source": [ "def prompt7(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. 
Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, the Finglish of some of the words is given as help.\n", "\n", " ## Examples\n", " input=[جشن چهل مرد]. dict entries: 'چهل': Chehel, 'جشن': jaShn, output=[jaShn-e Chehel mard].\n", " input=[گل نو در غار هست یا خانه؟]. dict entries: 'غار': Ghaar, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد]. dict entries: . output=[ettefaaGhi nemi-oftad]\n", " input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. 
output=[gorg heyvaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f'''This is the Persian sentence: [{grapheme}].\n", " These are the pronunciations of some of the words I know:\n", " {get_known_words(grapheme, multiple_choices=False)}.\n", " Return Finglish of the Persian sentence in brackets like output=[].'''\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "3l-Do1xv1bEQ" }, "execution_count": 30, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 8: Dict Hints (3)" ], "metadata": { "id": "G2GAkjbQ61mX" } }, { "cell_type": "code", "source": [ "def prompt8(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. 
Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.\n", "\n", " Here are a few examples of what I want:\n", " input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد], output=[ettefaaGhi nemi-oftad]\n", " input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\\n Complete the Finglish of it and return the result in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "9g_AsyX961mg" }, "execution_count": 31, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Prompt 9: Combined method" ], "metadata": { "id": "JKZDIkd3qTXq" } }, { "cell_type": "code", "source": [ "def prompt9(grapheme):\n", " matches = None\n", "\n", " while not matches:\n", " messages = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": '''You are an assistant that converts 
Persian sentences into their Finglish representation.\n", "\n", " ## Transliteration Guidelines\n", " 1. Accurately represent the pronunciation of Persian words.\n", " 2. Use hyphens to connect words with Ezafe when needed (e.g., \"رنگ آبی\": \"rang-e aabi\", \"زندگی شیرین\": \"zendegi-ye Shirin\").\n", " 3. Use \"Sh\" for 'ش', \"Ch\" for 'چ', \"Kh\" for 'خ', \"Gh\" for 'ق' and 'غ', \"Zh\" for 'ژ'.\n", "\n", " Additional guidelines:\n", " - Short vowels: a (ـَ), e (ـِ), o (ـُ)\n", " - Long vowels: aa (آ/ا), i (ای), oo (او)\n", " - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک\n", " - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)\n", " - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced\n", "\n", " In the inputs you will be given, some words are already replaced by their Finglish, I want you to complete it.\n", "\n", " Here are a few examples of what I want:\n", " input=[jaryaan شما Chi بود؟], output=[jaryaan-e Shomaa Chi bood].\n", " input=[گل no در Ghaar هست یا Khaane؟], output=[ɡol-e no dar Ghaar hast ya Khaane]\n", " input=[Zhen زیبارویان pooldaar], output=[Zhen-e zibaarooyaan-e pooldaar]\n", " input=[اتفاقی نمیافتد], output=[ettefaaGhi nemi-oftad]\n", " input=[gorg حیوانی vahShi است], output=[gorg heivaani vahShi ast].\n", " '''\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"This is the Persian sentence: [{replace_words_with_dict(grapheme)}].\\n Complete the Finglish of it and return the result in brackets like output=[].\"\n", " }]\n", "\n", " response = get_response(messages)\n", "\n", " matches = re.findall(r'\\[[^\\]]+\\]', response)\n", "\n", " if matches:\n", " output = matches[0].strip('[]')\n", " output = replace_LLM_characters(output, output_to_finglish_map)\n", " output = correct_output_by_llm_and_dict_info_finglish(grapheme, output)\n", " output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)\n", " output = 
substitute_by_dict(output, grapheme)\n", " return output" ], "metadata": { "id": "jOvLHgE-qTX6" }, "execution_count": 32, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XjAPkfq7SF87" }, "source": [ "## Get Evaluation Data" ] }, { "cell_type": "markdown", "source": [ "# Get Evaluation Data" ], "metadata": { "id": "F6soKurLkkWu" } }, { "cell_type": "code", "source": [ "!wget https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv" ], "metadata": { "id": "RbgMVxCsYf7M", "outputId": "8a6293f5-74f5-4eed-d6f0-59210c93f796", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 33, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2025-01-09 20:32:28-- https://huggingface.co/datasets/MahtaFetrat/SentenceBench/raw/main/SentenceBench.csv\n", "Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.6, 3.171.171.104, ...\n", "Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 56029 (55K) [text/plain]\n", "Saving to: ‘SentenceBench.csv’\n", "\n", "\rSentenceBench.csv 0%[ ] 0 --.-KB/s \rSentenceBench.csv 100%[===================>] 54.72K --.-KB/s in 0.01s \n", "\n", "2025-01-09 20:32:28 (4.37 MB/s) - ‘SentenceBench.csv’ saved [56029/56029]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "sentence_bench = pd.read_csv('SentenceBench.csv', names=['dataset', 'grapheme', 'phoneme', 'homograph word',\t'pronunciation'])" ], "metadata": { "id": "qwCG0jX-88nQ" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "sentence_bench.head(3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "qlYbrnUa9LAN", "outputId": "7152d621-d207-4a4c-931f-454c54e82eb5" }, "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " dataset grapheme \\\n", "0 dataset grapheme \n", "1 polyphone من قدر تو را میدانم \n", "2 polyphone از قضای الهی به قدر الهی پناه میبرم \n", "\n", " phoneme homograph word \\\n", "0 phoneme polyphone word \n", "1 man qadr-e to rA mi-dAnam قدر \n", "2 ?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram قدر \n", "\n", " pronunciation \n", "0 pronunciation \n", "1 qadr \n", "2 qadar " ], "text/html": [ "\n", "
\n", " | dataset | \n", "grapheme | \n", "phoneme | \n", "homograph word | \n", "pronunciation | \n", "
---|---|---|---|---|---|
0 | \n", "dataset | \n", "grapheme | \n", "phoneme | \n", "polyphone word | \n", "pronunciation | \n", "
1 | \n", "polyphone | \n", "من قدر تو را میدانم | \n", "man qadr-e to rA mi-dAnam | \n", "قدر | \n", "qadr | \n", "
2 | \n", "polyphone | \n", "از قضای الهی به قدر الهی پناه میبرم | \n", "?az qazAy ?elAhi be qadar-e ?elAhi panAh mi-baram | \n", "قدر | \n", "qadar | \n", "