# Setup Env

In [None]:
! pip install hazm    # Requires restart.

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... done
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [None]:
! pip install spacy-transformers
! pip install spacy
! pip install jiwer
! pip install groq
! pip install --upgrade fireworks-ai
! pip install python-Levenshtein
! pip install spacy-transformers
! pip install langchain_openai
# ! pip install together



In [None]:
import os
import re
from tqdm import tqdm
import csv
import pandas as pd
import json
import requests
import Levenshtein
import itertools
from groq import Groq
from difflib import SequenceMatcher
# from together import Together
from fireworks.client import Fireworks
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from jiwer import cer

# Setup POSTagger

In [None]:
! git clone https://huggingface.co/roshan-research/spacy_pos_tagger_parsbertpostagger

Cloning into 'spacy_pos_tagger_parsbertpostagger'...
remote: Enumerating objects: 21, done.
remote: Counting objects: 100% (18/18), done.
remote: Compressing objects: 100% (15/15), done.
remote: Total 21 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)
Unpacking objects: 100% (21/21), 719.37 KiB | 1.29 MiB/s, done.


In [None]:
import spacy
"""این ماژول شامل کلاس‌ها و توابعی برای برچسب‌گذاری توکن‌هاست."""

from nltk.tag import stanford  # noqa: I001
from hazm import SequenceTagger

import os
import subprocess

from spacy.tokens import Doc
from spacy.tokens import DocBin
from spacy.vocab import Vocab

from sklearn.metrics import classification_report,f1_score,accuracy_score,precision_score,recall_score

from tqdm import tqdm


# Tokens treated as punctuation by the tagger's feature extractor.
# ASCII marks first, then the Persian quotation marks / comma, then the
# sentence-final marks; the order matches the original list.
punctuation_list = (
    list('"#()*,-./:[]')
    + ["«", "»", "،"]
    + list(";?!")
)


class POSTagger(SequenceTagger):
    """Part-of-speech tagger built on hazm's ``SequenceTagger``.

    Adds a hand-written feature extractor (``data_maker`` / ``features``) and
    an optional "universal" mode in which everything after the first comma of
    a predicted tag is dropped (e.g. ``NOUN,EZ`` becomes ``NOUN``).
    """

    def __init__(
        self: "POSTagger", model=None, data_maker=None, universal_tag=False,
    ) -> None:
        """Create the tagger.

        Args:
            model: Forwarded unchanged to ``SequenceTagger.__init__``.
            data_maker: Feature-extraction callable; defaults to this class's
                own ``data_maker`` method.
            universal_tag: When true, ``tag`` / ``tag_sents`` strip everything
                after the first comma of each tag.
        """
        data_maker = self.data_maker if data_maker is None else data_maker
        self.__is_universal = universal_tag
        super().__init__(model, data_maker)

    def __universal_converter(self: "POSTagger", tagged_list):
        """Return ``tagged_list`` with each tag cut at its first comma."""
        return [(word, tag.split(",")[0]) for word, tag in tagged_list]

    def __is_punc(self: "POSTagger", word):
        """Return True when ``word`` is one of the entries of ``punctuation_list``."""
        return word in punctuation_list

    def data_maker(self: "POSTagger", tokens):
        """Turn tokenized sentences into per-token feature dictionaries.

        Examples:
            >>> posTagger = POSTagger(model = 'pos_tagger.model')
            >>> feats = posTagger.data_maker(tokens = [['دلم', 'اینجا', 'مانده‌است', '.']])
            >>> feats[0][0]['word'], feats[0][0]['is_first'], feats[0][0]['next_word']
            ('دلم', True, 'اینجا')

        Args:
            tokens (List[List[str]]): Sentences, each a list of token strings.

        Returns:
            List[List[Dict]]: One list per sentence holding one feature
            dictionary (see ``features``) per token.
        """
        return [
            [self.features(sentence, index) for index in range(len(sentence))]
            for sentence in tokens
        ]

    def features(self: "POSTagger", sentence, index):
        """Build the feature dictionary for ``sentence[index]``.

        Context features that fall outside the sentence are the empty string
        ``""`` (not ``False``), including the boolean-looking ``*_is_numeric``
        and ``*_is_punc`` entries.

        Args:
            sentence (List[str]): Tokens of one sentence.
            index (int): Position of the token to describe.

        Returns:
            Dict: Word, affix, neighbour, numeric and punctuation features.
        """
        word = sentence[index]
        last_index = len(sentence) - 1
        is_first = index == 0
        is_last = index == last_index
        return {
            "word": word,
            "is_first": is_first,
            "is_last": is_last,
            # *ix
            # Slices instead of [0] / [-1] so an empty-string token yields ""
            # rather than raising IndexError; identical for non-empty tokens.
            "prefix-1": word[:1],
            "prefix-2": word[:2],
            "prefix-3": word[:3],
            "suffix-1": word[-1:],
            "suffix-2": word[-2:],
            "suffix-3": word[-3:],
            # word
            "prev_word": "" if is_first else sentence[index - 1],
            # NOTE(review): for index == 1 this evaluates sentence[-1], i.e.
            # the LAST token of the sentence, because of negative indexing.
            # Left unchanged on purpose: altering it changes the feature
            # values, which presumably matter to already-trained models.
            "two_prev_word": "" if is_first else sentence[index - 2],
            "next_word": "" if is_last else sentence[index + 1],
            "two_next_word": (
                ""
                if index in {last_index, last_index - 1}
                else sentence[index + 2]
            ),
            # digit
            "is_numeric": word.isdigit(),
            "prev_is_numeric": "" if is_first else sentence[index - 1].isdigit(),
            "next_is_numeric": (
                "" if is_last else sentence[index + 1].isdigit()
            ),
            # punc
            "is_punc": self.__is_punc(word),
            "prev_is_punc": "" if is_first else self.__is_punc(sentence[index - 1]),
            "next_is_punc": (
                ""
                if is_last
                else self.__is_punc(sentence[index + 1])
            ),
        }

    def tag(self: "POSTagger", tokens):
        """Tag one tokenized sentence.

        Examples:
            >>> posTagger = POSTagger(model = 'pos_tagger.model')
            >>> posTagger.tag(tokens = ['من', 'به', 'مدرسه', 'ایران', 'رفته_بودم', '.'])
            [('من', 'PRON'), ('به', 'ADP'), ('مدرسه', 'NOUN,EZ'), ('ایران', 'NOUN'), ('رفته_بودم', 'VERB'), ('.', 'PUNCT')]

            >>> posTagger = POSTagger(model = 'pos_tagger.model', universal_tag = True)
            >>> posTagger.tag(tokens = ['من', 'به', 'مدرسه', 'ایران', 'رفته_بودم', '.'])
            [('من', 'PRON'), ('به', 'ADP'), ('مدرسه', 'NOUN'), ('ایران', 'NOUN'), ('رفته_بودم', 'VERB'), ('.', 'PUNCT')]

        Args:
            tokens (List[str]): Tokens of the sentence to tag.

        Returns:
            List[Tuple[str, str]]: ``(token, tag)`` pairs.
        """
        tagged_token = super().tag(tokens)
        return (
            self.__universal_converter(tagged_token)
            if self.__is_universal
            else tagged_token
        )

    def tag_sents(self: "POSTagger", sentences):
        """Tag several tokenized sentences.

        Examples:
            >>> posTagger = POSTagger(model = 'pos_tagger.model')
            >>> posTagger.tag_sents(sentences = [['من', 'به', 'مدرسه', 'ایران', 'رفته_بودم', '.']])
            [[('من', 'PRON'), ('به', 'ADP'), ('مدرسه', 'NOUN,EZ'), ('ایران', 'NOUN'), ('رفته_بودم', 'VERB'), ('.', 'PUNCT')]]

            >>> posTagger = POSTagger(model = 'pos_tagger.model', universal_tag = True)
            >>> posTagger.tag_sents(sentences = [['من', 'به', 'مدرسه', 'ایران', 'رفته_بودم', '.']])
            [[('من', 'PRON'), ('به', 'ADP'), ('مدرسه', 'NOUN'), ('ایران', 'NOUN'), ('رفته_بودم', 'VERB'), ('.', 'PUNCT')]]

        Args:
            sentences (List[List[str]]): Sentences to tag, each a token list.

        Returns:
            List[List[Tuple[str, str]]]: For every sentence, its list of
            ``(token, tag)`` pairs.
        """
        tagged_sents = super().tag_sents(sentences)
        return (
            [self.__universal_converter(tagged_sent) for tagged_sent in tagged_sents]
            if self.__is_universal
            else tagged_sents
        )


class StanfordPOSTagger(stanford.StanfordPOSTagger):
    """Thin wrapper around NLTK's ``StanfordPOSTagger``.

    Uses ``/`` as the word/tag separator and replaces spaces inside tokens
    with underscores before the sentences are handed to the parent tagger.
    """

    def __init__(
        self: "StanfordPOSTagger",
        model_filename: "str",
        path_to_jar: str,
        *args, # noqa: ANN002
        **kwargs, # noqa: ANN003
    ) -> None:
        """Set the separator and initialise the tagger.

        Note that initialisation deliberately starts *after*
        ``stanford.StanfordPOSTagger`` in the MRO, so that class's own
        ``__init__`` is bypassed.
        """
        self._SEPARATOR = "/"
        grandparent = super(stanford.StanfordPOSTagger, self)
        grandparent.__init__(
            *args,
            model_filename=model_filename,
            path_to_jar=path_to_jar,
            **kwargs,
        )

    def tag(self: "StanfordPOSTagger", tokens):
        """Tag a single tokenized sentence.

        Examples:
            >>> tagger = StanfordPOSTagger(model_filename='persian.tagger', path_to_jar='stanford_postagger.jar')
            >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.'])
            [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]

        """
        tagged_sentences = self.tag_sents([tokens])
        return tagged_sentences[0]

    def tag_sents(self: "StanfordPOSTagger", sentences):
        """Tag several tokenized sentences (spaces in tokens become ``_``)."""

        def underscore_spaces(sentence):
            return [word.replace(" ", "_") for word in sentence]

        # Lazily transformed, exactly like the generator it replaces.
        refined = map(underscore_spaces, sentences)
        return super(stanford.StanfordPOSTagger, self).tag_sents(refined)


class SpacyPOSTagger(POSTagger):
    """POS tagger that delegates tagging to a spaCy pipeline.

    Input is always pre-tokenized. Each sentence is joined with single spaces
    before it is handed to spaCy, and the original token list is remembered in
    ``self.peykare_dict`` under that joined text. A custom tokenizer looks the
    text up again so the pipeline receives exactly the caller's tokens.
    """

    def __init__(
        self: "SpacyPOSTagger",
        model_path=None,
        using_gpu=False,
        gpu_id=0
    ):
        """Store the configuration and run the optional GPU setup.

        The spaCy model itself is loaded lazily: on the first ``tag`` /
        ``tag_sents`` call, in ``evaluate``, or at the end of ``train``.

        Args:
            model_path: Directory of a trained spaCy pipeline. A spaCy training
                output directory usually contains ``model-last`` and
                ``model-best``; pass one of those, e.g. ``output/model-last``.
            using_gpu: Ask spaCy to use a GPU; when none is available the CPU
                is used instead.
            gpu_id: Index of the GPU to use.
        """
        super().__init__(universal_tag=True)
        self.model_path = model_path
        self.using_gpu = using_gpu
        self.gpu_id = gpu_id
        self.tagger = None
        self.peykare_dict = {}
        # Always defined so ``train`` can read it even when the GPU setup is
        # skipped (it used to exist only after ``_setup_gpu`` had run).
        self.gpu_availability = False
        self._setup()

    def _setup(self: "SpacyPOSTagger"):
        """Run the GPU setup when requested; otherwise just report CPU usage."""
        if self.using_gpu:
            self._setup_gpu()
        else:
            print("------------- You Prefer to use CPU --------------")

    def _setup_model(self: "SpacyPOSTagger", sents):
        """Load the spaCy pipeline and install the lookup tokenizer.

        Args:
            sents: Tokenized sentences used to (re)build ``self.peykare_dict``.
        """
        self.peykare_dict = {}
        self.tagger = spacy.load(self.model_path)
        self._set_peykare_dictionary(sents)
        self.tagger.tokenizer = self._custom_tokenizer

    def _setup_gpu(self: "SpacyPOSTagger"):
        """Try to activate the GPU and record the outcome in ``gpu_availability``."""
        print("------------------ GPU Setup Process Started ---------------------")
        gpu_available = spacy.prefer_gpu(gpu_id=self.gpu_id)
        if gpu_available:
            print("------------ GPU is available and ready for use -------------")
            spacy.require_gpu(gpu_id=self.gpu_id)
            self.gpu_availability = True
        else:
            print("------------ GPU is not available; spaCy will use CPU -------------")
            self.gpu_availability = False

    def _setup_dataset(self: "SpacyPOSTagger", dataset, saved_directory, data_type='train'):
        """Write ``dataset`` to ``<saved_directory>/<data_type>.spacy``.

        Args:
            dataset: Sentences, each a sequence of ``(word, tag)`` pairs.
            saved_directory: Target directory (created when missing).
            data_type: ``'train'`` or ``'test'``; becomes the file name.

        Raises:
            ValueError: If ``data_type`` is neither ``'train'`` nor ``'test'``.
        """
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if data_type not in ('train', 'test'):
            raise ValueError(f"data_type must be 'train' or 'test', got {data_type!r}")

        db = DocBin()
        for sent in tqdm(dataset):
            words = [pair[0] for pair in sent]
            tags = [pair[1] for pair in sent]
            doc = Doc(Vocab(strings=words), words=words)
            for token, tag in zip(doc, tags):
                token.tag_ = tag
            db.add(doc)

        self._handle_data_path(saved_directory)
        db.to_disk(f'{saved_directory}/{data_type}.spacy')

    def _handle_data_path(self, path='POSTaggerDataset'):
        """Create directory ``path`` (and parents) when it does not exist yet."""
        os.makedirs(path, exist_ok=True)

    def _custom_tokenizer(self: "SpacyPOSTagger", text):
        """Tokenizer installed on the spaCy pipeline.

        Args:
            text: A sentence previously registered in ``self.peykare_dict``
                (its tokens joined by single spaces).

        Returns:
            A ``Doc`` built from the registered token list.

        Raises:
            ValueError: If ``text`` was never registered.
        """
        if text not in self.peykare_dict:
            raise ValueError('No tokenization available for input.')
        return Doc(self.tagger.vocab, self.peykare_dict[text])

    def _set_peykare_dictionary(self: "SpacyPOSTagger", sents):
        """Replace ``self.peykare_dict`` with a mapping built from ``sents``.

        Keys are the space-joined sentences, values are copies of the token lists.
        """
        self.peykare_dict = {' '.join(sent): list(sent) for sent in sents}

    def _add_to_dict(self: "SpacyPOSTagger", sents):
        """Register every sentence of ``sents`` that is not in the dictionary yet."""
        for sent in sents:
            self.peykare_dict.setdefault(' '.join(sent), sent)

    def tag(self: "SpacyPOSTagger", tokens, universal_tag=True):
        """Tag one tokenized sentence with the spaCy model.

        The model is loaded on first use.

        Args:
            tokens (List[str]): Tokens of the sentence to tag.
            universal_tag: When true, the ``,EZ`` marker is removed from
                every predicted tag.

        Returns:
            List[Tuple[str, str]]: ``(token, tag)`` pairs.
        """
        if self.tagger is None:
            self._setup_model([tokens])
        self._add_to_dict([tokens])

        doc = self.tagger(' '.join(tokens))
        if universal_tag:
            tags = [tok.tag_.replace(',EZ', '') for tok in doc]
        else:
            tags = [tok.tag_ for tok in doc]

        return list(zip(tokens, tags))

    def tag_sents(self: "SpacyPOSTagger", sents, universal_tag=True, batch_size=128):
        """Tag several tokenized sentences with the spaCy model.

        Args:
            sents (List[List[str]]): Sentences to tag, each a token list.
            universal_tag: When true, the ``,EZ`` marker is removed from
                every predicted tag.
            batch_size: Batch size forwarded to ``Language.pipe``.

        Returns:
            List[List[Tuple[str, str]]]: For every sentence, its list of
            ``(token, tag)`` pairs.
        """
        if self.tagger is None:
            self._setup_model(sents)
        self._add_to_dict(sents)

        texts = (' '.join(sent) for sent in sents)
        docs = list(self.tagger.pipe(texts, batch_size=batch_size))
        if universal_tag:
            tags = [[tok.tag_.replace(',EZ', '') for tok in doc] for doc in docs]
        else:
            tags = [[tok.tag_ for tok in doc] for doc in docs]

        return [list(zip(sent, sent_tags)) for sent, sent_tags in zip(sents, tags)]

    def train(
        self: "SpacyPOSTagger",
        train_dataset,
        test_dataset,
        data_directory,
        base_config_file,
        train_config_path,
        output_dir,
        use_direct_config=False
    ):
        """Train a spaCy model by running ``python -m spacy train``.

        Args:
            train_dataset: Training sentences, each a list of ``(token, tag)``
                pairs. When falsy, an existing ``train.spacy`` file in
                ``data_directory`` is used.
            test_dataset: Dev sentences in the same format. When falsy, an
                existing ``test.spacy`` file in ``data_directory`` is used.
            data_directory: Directory holding ``train.spacy`` / ``test.spacy``.
            base_config_file: Base spaCy config; only used when
                ``use_direct_config`` is false.
            train_config_path: With ``use_direct_config=True`` the ready-made
                training config to use; otherwise the path where the config
                generated from ``base_config_file`` is written.
            output_dir: Directory that receives the trained model.
            use_direct_config: See ``train_config_path``.

        Raises:
            subprocess.CalledProcessError: If a spaCy command exits non-zero.

        After training, ``self.model_path`` points at ``<output_dir>/model-last``
        and that model is loaded.
        """
        self.spacy_train_directory = data_directory
        self.train_dataset = train_dataset  # List[List[Tuple]]
        self.test_dataset = test_dataset
        if self.train_dataset:
            self._setup_dataset(
                dataset=self.train_dataset,
                saved_directory=self.spacy_train_directory,
                data_type='train',
            )
        if self.test_dataset:
            self._setup_dataset(test_dataset, saved_directory=data_directory, data_type='test')

        train_data = f'{data_directory}/train.spacy'
        test_data = f'{data_directory}/test.spacy'

        if use_direct_config:
            self.train_config_file = train_config_path
        else:
            self._setup_train_config(base_config_file, train_config_path=train_config_path)

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim. The former "./"
        # prefix is dropped because it turned absolute paths into relative
        # ones; relative paths resolve against the working directory either way.
        command = [
            "python", "-m", "spacy", "train", str(self.train_config_file),
            "--output", str(output_dir),
            "--paths.train", train_data,
            "--paths.dev", test_data,
        ]
        if self.gpu_availability:
            command += ["--gpu-id", str(self.gpu_id)]

        # check=True: stop here on a failed run instead of trying to load a
        # model that was never written.
        subprocess.run(command, check=True)
        self.model_path = f"{output_dir}/model-last"
        self._setup_model([[w for w, _ in sent] for sent in (test_dataset or [])])

    def _setup_train_config(self: "SpacyPOSTagger", base_config, train_config_path):
        """Generate the training config with ``spacy init fill-config``.

        Args:
            base_config: Path to the base configuration file.
            train_config_path: Where the filled-in config is written; also
                stored in ``self.train_config_file``.

        Raises:
            subprocess.CalledProcessError: If the spaCy command exits non-zero.
        """
        self.train_config_file = train_config_path
        print("----------------- Setting up the training configuration file ----------------------")
        command = [
            "python", "-m", "spacy", "init", "fill-config",
            str(base_config), str(train_config_path),
        ]
        subprocess.run(command, check=True)
        print("----------------- Training configuration file created successfully ----------------------")
        print(f"----------------- Training Config file address is {train_config_path} --------------------")

    def evaluate(self: "SpacyPOSTagger", test_sents, batch_size):
        """Print evaluation reports for the EZ marker and for the plain tags.

        The model is (re)loaded from ``self.model_path`` first.

        Args:
            test_sents: Gold sentences, each a list of ``(token, tag)`` pairs.
            batch_size: Batch size used while predicting.

        Raises:
            ValueError: If no model path is configured.
        """
        # Checked up front: previously the equivalent check sat after the
        # model load and could never trigger.
        if not self.model_path:
            raise ValueError("Model does not exist.Please train a new one with train method of this instance or give a model_path , setup the model with self._setup_model() and then call evaluate")

        self._setup_model([[w for w, _ in sent] for sent in test_sents])
        golds, predictions = self._get_labels_and_predictions(test_sents, batch_size)
        print("-----------------------------------------")
        self._evaluate_tags(test_sents, golds, predictions, use_ez_tags=True, batch_size=batch_size)
        print("-----------------------------------------")
        self._evaluate_tags(test_sents, golds, predictions, use_ez_tags=False, batch_size=batch_size)

    def _evaluate_tags(self, sents, golds=None, predictions=None, use_ez_tags=True, batch_size=128):
        """Print a classification report plus precision, recall and F1.

        Args:
            sents: Gold sentences; only used when ``golds`` or ``predictions``
                is missing.
            golds: Per-sentence lists of gold tag strings (optional).
            predictions: Per-sentence lists of predicted tag strings (optional).
            use_ez_tags: True scores only presence/absence of the EZ marker;
                False scores the tags with the ``,EZ`` marker removed.
            batch_size: Batch size used when predictions must be computed.
        """
        if golds is None or predictions is None:
            golds, predictions = self._get_labels_and_predictions(sents, batch_size)

        normalize = self._get_ez_tags if use_ez_tags else self._remove_ez_tags

        predictions_cleaned = []
        golds_cleaned = []
        for sent_preds, sent_golds in zip(predictions, golds):
            predictions_cleaned.extend(normalize(tag) for tag in sent_preds)
            golds_cleaned.extend(normalize(tag) for tag in sent_golds)

        print(classification_report(golds_cleaned, predictions_cleaned))
        print('Precision: %.5f' % precision_score(golds_cleaned, predictions_cleaned, average='weighted'))
        print('Recall: %.5f' % recall_score(golds_cleaned, predictions_cleaned, average='weighted'))
        # NOTE(review): F1 is macro-averaged while precision/recall above are
        # weighted -- kept as found; confirm this mix is intended.
        print('F1-Score: %.5f' % f1_score(golds_cleaned, predictions_cleaned, average='macro'))

    def _get_ez_tags(self, label):
        """Return ``'EZ'`` when ``label`` contains ``EZ``, otherwise ``'-'``."""
        return 'EZ' if 'EZ' in label else '-'

    def _remove_ez_tags(self, label):
        """Return ``label`` with every ``,EZ`` occurrence removed."""
        return label.replace(',EZ', '')

    def _evaluate_ez_tags(self, sents):
        """Evaluate only the EZ marker on ``sents``."""
        self._evaluate_tags(sents, use_ez_tags=True)

    def _evaluate_normal_tags(self, sents):
        """Evaluate the tags of ``sents`` with the EZ marker removed."""
        self._evaluate_tags(sents, use_ez_tags=False)

    def _get_labels_and_predictions(self: "SpacyPOSTagger", sents, batch_size):
        """Return gold and predicted tag strings for ``sents``.

        Args:
            sents: Gold sentences, each a list of ``(token, tag)`` pairs.
            batch_size: Batch size used while predicting.

        Returns:
            Tuple ``(gold_labels, prediction_labels)``; both are per-sentence
            lists of tag strings.
        """
        gold_labels = [[tag for _, tag in sent] for sent in sents]
        tokens = [[w for w, _ in sent] for sent in sents]
        # batch_size must go by keyword (the second positional parameter of
        # tag_sents is universal_tag), and universal_tag=False keeps the
        # ``,EZ`` markers that the EZ evaluation needs.
        tagged = self.tag_sents(tokens, universal_tag=False, batch_size=batch_size)
        # tag_sents returns (token, tag) pairs; the metrics need the tags only.
        prediction_labels = [[tag for _, tag in sent] for sent in tagged]
        return gold_labels, prediction_labels


In [None]:
# Tagger instance pointing at the pipeline directory under /content;
# using_gpu is left at its default (False).
spacy_posTagger = SpacyPOSTagger(
    model_path="/content/spacy_pos_tagger_parsbertpostagger",
)

------------- You Prefer to use CPU --------------


# Setup LLM

In [None]:
def get_response(messages, model="meta-llama/llama-3.1-405b-instruct", server='openrouter', max_retries=None):
    """Send a chat conversation to an LLM provider and return the reply text.

    Args:
        messages: Chat messages in the format the chosen client expects
            (passed through unchanged).
        model: Model identifier understood by the chosen provider.
        server: One of ``'avalai'``, ``'openrouter'``, ``'groq'``,
            ``'fireworks'`` or ``'together'``.
        max_retries: Only used for ``'groq'``. ``None`` (default) retries
            forever, as before; an integer re-raises the last error once that
            many retries have failed.

    Returns:
        The content of the first returned choice / message.

    Raises:
        ValueError: If ``server`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    if server == 'avalai':
        llm = ChatOpenAI(model=model, base_url="https://api.avalai.ir/v1", api_key="API-KEY")
        response = llm.invoke(messages)
        print("prompted!")
        return response.content

    if server == 'openrouter':
        # NOTE(review): "API-KEY" is a placeholder; confirm the header format
        # the provider expects (commonly "Bearer <key>") when filling it in.
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": "API-KEY",
            },
            data=json.dumps({
                "model": model,
                "messages": messages,
            }),
        )
        return response.json()['choices'][0]['message']['content']

    if server == 'groq':
        import time

        client = Groq(api_key="API-KEY")
        failures = 0
        while True:
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                )
                return response.choices[0].message.content
            except Exception as e:
                # Best-effort retry kept from the original, now with a capped
                # exponential back-off so a failing endpoint is not hammered,
                # and an optional upper bound on the number of retries.
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                time.sleep(min(2 ** failures, 30))

    if server == 'fireworks':
        client = Fireworks(api_key="API-KEY")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        return response.choices[0].message.content

    if server == 'together':
        # NOTE(review): the `from together import Together` import and its
        # pip install are commented out in this notebook, so this branch
        # raises NameError until they are restored.
        client = Together(api_key="API-KEY")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        return response.choices[0].message.content

    raise ValueError(f"Unknown server: {server!r}")


# Get Data

In [None]:
# Mount Google Drive at /content/drive/ (Colab only); force_remount=True
# is passed so an already-mounted drive is mounted again.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Get Merged Dict

In [None]:
# Location of the grapheme/phoneme dictionary CSV on the mounted Drive.
merged_dict_path = "/content/drive/MyDrive/merged_dict_new.csv"

In [None]:
# Load the dictionary; the cells below read its 'grapheme' and 'phoneme' columns.
merged_dict_df = pd.read_csv(merged_dict_path)

In [None]:
# Show the (rows, columns) size of the loaded dictionary table.
merged_dict_df.shape

In [None]:
# Look-ahead patterns of the form "(?=x|y|...)": they assert that the next
# character is one of the listed symbols without consuming it.
_consonant_symbols = "qrtypsdfghjklzxcvbnm" + "QRTYPSDFGHJKLZXCVBNM"
_vowel_symbols = "aAeiuo"

consonants_regex = "(?=" + "|".join(_consonant_symbols) + ")"
vowels_regex = "(?=" + "|".join(_vowel_symbols) + ")"

In [None]:
# grapheme -> list of phoneme strings (one entry per CSV row).
merged_dict = {}

# NOTE(review): eval() executes whatever text is stored in the 'phoneme'
# column. If that column only ever holds list literals, ast.literal_eval
# would be a safer drop-in -- confirm against the CSV before switching.
for g, raw_phoneme in zip(merged_dict_df['grapheme'], merged_dict_df['phoneme']):
    p = ''.join(eval(raw_phoneme))
    # Insert '?' in front of a vowel that starts the string or follows a
    # character that is neither a word character, '-' nor '?'.
    p = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', p)
    merged_dict.setdefault(g, []).append(p)

In [None]:
# Single-character phoneme symbols and the Finglish spelling that replaces
# each of them; symbols not listed here are left untouched by
# replace_phonetic_characters.
phoneme_to_finglish_map = {
    "A": "aa",
    "S": "Sh",
    "Z": "Zh",
    "q": "Gh",
    "x": "Kh",
    "u": "oo",
    "?": "'",
    "C": "Ch",
}

def replace_phonetic_characters(input_string, char_map):
    """Return *input_string* with each character found in *char_map* replaced.

    *char_map* maps single characters to replacement strings (the form accepted
    by ``str.maketrans``). Characters without an entry are kept as they are, and
    every character is translated exactly once (replacements are not re-scanned).
    """
    return input_string.translate(str.maketrans(char_map))

In [None]:
# Scratch check of the pattern that drops an apostrophe at the start of the
# string or right after a character other than a word character, '-' or '?'.
# The pattern is a raw string: in a normal string literal '\w', '\-' and '\?'
# are invalid escape sequences (SyntaxWarning on Python 3.12+).
re.sub(r"([^\w\-\?]|^)'", r'\1', "-'ali")

In [None]:
# Finglish spelling of every dictionary pronunciation: grapheme -> list of
# Finglish strings, in the same order as merged_dict[grapheme].
finglish_merged_dict = {}

for k, vs in merged_dict.items():
  finglish_vs = []
  for v in vs:
    p = replace_phonetic_characters(v, phoneme_to_finglish_map)
    # Drop an apostrophe (translated glottal stop) at the start of the string or
    # after a character other than a word character, '-' or '?'. Raw string, so
    # '\w', '\-' and '\?' reach the regex engine instead of being invalid
    # string escapes.
    p = re.sub(r"([^\w\-\?]|^)'", r'\1', p)
    finglish_vs.append(p)

  finglish_merged_dict[k] = finglish_vs

In [None]:
# Reverse lookup: Finglish spelling -> grapheme. When several graphemes share a
# spelling, the grapheme visited last in finglish_merged_dict is the one kept.
inverted_finglish_merged_dict = {
    spelling: grapheme
    for grapheme, spellings in finglish_merged_dict.items()
    for spelling in spellings
}

In [None]:
# Reverse lookup: phoneme string -> grapheme. When several graphemes share a
# phoneme string, the grapheme visited last in merged_dict is the one kept.
inverted_merged_dict = {
    phoneme: grapheme
    for grapheme, phonemes in merged_dict.items()
    for phoneme in phonemes
}

In [None]:
def word_in_dict(word, inverted_dictionary=inverted_finglish_merged_dict):
  """Tell whether *word* is a key of *inverted_dictionary*.

  The default dictionary is the Finglish reverse lookup; note that the default
  is bound once, when this function is defined.
  """
  is_known = word in inverted_dictionary
  return is_known

# Define post-processing

In [None]:
# Character translation table handed to replace_LLM_characters (which feeds it
# to str.maketrans) to rewrite IPA symbols, accented/uppercase Latin letters,
# Cyrillic look-alikes and Persian/Arabic letters into this notebook's Finglish
# spelling. Mapping a character to '' deletes it.
# Every key has to be a single character for str.maketrans to accept the dict.
# Some keys appear more than once (e.g. 'W', 'š'); in a dict literal the last
# occurrence is the one that takes effect.
output_to_finglish_map = {
    'м': 'm',
    'ʷ': 'v',
    'w': 'v',
    'q': 'Gh',
    'x': 'Kh',
    'u': 'oo',
    '?': "'",
    # 'ɔ': '?',
    'ĉ': 'Ch',
    'č': 'Ch',
    '̕': "'",
    # "'": '',
    'ʔ': "'",
    'ꞌ': "'",
    '̛':  "'",
    '’': "'",
    'ʼ': "'",
    'ʿ': "'",
    '̓': '',
    'â': 'aa',
    'â': 'aa',
    'ȃ': 'aa',
    'c': 'k',
    'ž': 'Zh',
    'š': 'Sh',
    'W': 'v',
    'β': 'f',
    'е': 'e',
    'х': 'Kh',
    '`': "'",
    'ɑ': 'aa',
    'ɑ': 'aa',
    'ʃ': 'Sh',
    'ð': 'z',
    'ɾ': 'r',
    'æ': 'a',
    'ɪ': 'e',
    'χ': 'Kh',
    'ɣ': 'Gh',
    'ʒ': 'Zh',
    ':': '',
    'ā': 'aa',
    'ː': '',
    'ä': 'aa',
    'á': 'aa',
    'š': 'Sh',
    'ū': 'oo',
    # 'û': 'oo', # Not sure
    'ś': 's',
    'ī': 'i',
    # 'í': 'i', # Not sure
    'î': 'i',
    'é': 'e', #
    'ḥ': 'h',
    'ɒ': 'aa',
    'ʰ': 'h',
    'ə': 'e',
    'R': 'r',
    'W': 'v',
    'Q': 'q',
    'T': 't',
    'Y': 'y',
    'P': 'p',
    'D': 'd',
    'F': 'f',
    'H': 'h',
    'J': 'j',
    'L': 'l',
    'X': 'Kh',
    'V': 'v',
    'B': 'b',
    'N': 'n',
    'M': 'm',
    'K': 'k',
    'G': 'g',
    'U': 'u',
    'O': 'o',
    'I': 'i',
    'E': 'e',
    'ا': 'aa',
    'ب': 'b',
    'پ': 'p',
    'ت': 't',
    'ث': 's',
    'ج': 'j',
    'چ': 'Ch',
    'ح': 'h',
    'خ': 'Kh',
    'د': 'd',
    'ذ': 'z',
    'ر': 'r',
    'ز': 'z',
    'ژ': 'Zh',
    'س': 's',
    'ش': 'Sh',
    'ص': 's',
    'ض': 'z',
    'ط': 't',
    'ظ': 'z',
    'ع': "'",
    'غ': 'Gh',
    'ف': 'f',
    'ق': 'Gh',
    'ک': 'k',
    'گ': 'g',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'و': 'v',
    'ه': 'h',
    'ی': 'y',
    'ء': "'",
    'ئ': "'",
    'ؤ': "o'",
    'آ': 'aa',
    'أ': "a'",
    'إ': "e'",
    'ۀ': 'eye',
    'ŋ': 'ng',
    '.': '',
    'ɛ': 'e',
    'ʊ': 'oo',
    "ˈ": "'",
    'ù': 'oo',
    'θ': 's',
    '̪': '',
    'ũ': 'oo',
    '_': ''
}

# Variant of the character translation table in which IPA symbols and accented
# Latin letters are rewritten to single-letter phoneme symbols (S, Z, C, x, A,
# u, '?') instead of Finglish digraphs.
# NOTE(review): the Persian/Arabic letters in this table still map to Finglish
# digraphs ('Ch', 'Kh', 'Zh', 'Sh', 'Gh', "'") - confirm that mix is intended.
# Some keys appear more than once (e.g. 'W', 'š'); the last occurrence wins.
output_to_finglish_map_preserve_phonetics = {
    'м': 'm',
    'ʷ': 'v',
    'w': 'v',
    'x': 'x',
    # 'ɔ': '?',
    'ĉ': 'C',
    'č': 'C',
    '̕': "?",
    # "'": '',
    'ʔ': "?",
    'ꞌ': "?",
    '̛':  "?",
    '’': "?",
    'ʼ': "?",
    'ʿ': "?",
    'â': 'A',
    'â': 'A',
    'ȃ': 'A',
    'ž': 'Z',
    'š': 'S',
    'W': 'v',
    'β': 'f',
    'е': 'e',
    'х': 'x',
    '`': "?",
    'ɑ': 'A',
    'ɑ': 'A',
    'ʃ': 'S',
    'ð': 'z',
    'ɾ': 'r',
    'æ': 'a',
    'ɪ': 'e',
    'χ': 'x',
    'ɣ': 'q',
    'ʒ': 'Z',
    ':': '',
    'ā': 'A',
    'ː': '',
    'ä': 'A',
    'á': 'A',
    'š': 'S',
    'ū': 'u',
    # 'û': 'oo', # Not sure
    'ś': 's',
    'ī': 'i',
    # 'í': 'i', # Not sure
    'î': 'i',
    'é': 'e', #
    'ḥ': 'h',
    'ɒ': 'A',
    'ʰ': 'h',
    'ə': 'e',
    'R': 'r',
    'W': 'v',
    'Q': 'q',
    'T': 't',
    'Y': 'y',
    'P': 'p',
    'D': 'd',
    'F': 'f',
    'H': 'h',
    'J': 'j',
    'L': 'l',
    'X': 'x',
    'V': 'v',
    'B': 'b',
    'N': 'n',
    'M': 'm',
    'U': 'u',
    'O': 'o',
    'I': 'i',
    'E': 'e',
    'ا': 'aa',
    'ب': 'b',
    'پ': 'p',
    'ت': 't',
    'ث': 's',
    'ج': 'j',
    'چ': 'Ch',
    'ح': 'h',
    'خ': 'Kh',
    'د': 'd',
    'ذ': 'z',
    'ر': 'r',
    'ز': 'z',
    'ژ': 'Zh',
    'س': 's',
    'ش': 'Sh',
    'ص': 's',
    'ض': 'z',
    'ط': 't',
    'ظ': 'z',
    'ع': "'",
    'غ': 'Gh',
    'ف': 'f',
    'ق': 'Gh',
    'ک': 'k',
    'گ': 'g',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'و': 'v',
    'ه': 'h',
    'ی': 'y',
    'ء': "'",
    'ئ': "'",
    'ؤ': "o'",
    'آ': 'aa',
    'أ': "a'",
    'إ': "e'",
    'ۀ': 'eye',
    'ŋ': 'ng',
    '.': '',
    'ɛ': 'e',
    'ʊ': 'u',
    "ˈ": '?',
    'ù': 'u',
    'θ': 's',
    '̪': '',
    'ũ': 'u',
    '_': ''
}

def replace_LLM_characters(input_string, char_map):
    """Normalise raw LLM output towards Finglish spelling.

    Two passes are applied:

    1. A fixed, ordered list of multi-character sequences (IPA affricates,
       aspirated stops, 'ow') is rewritten. The order matters: 'tʃʰ' has to be
       handled before 'tʃ' and 'tʰ'.
    2. Every remaining character that is a key of *char_map* is replaced via
       ``str.translate``; other characters are kept.

    Returns the rewritten string.
    """
    sequence_rewrites = (
        (r'tʃʰ', 'ch'),
        ('tʃ', 'ch'),
        (r't͡S', 'ch'),
        (r'kʰ', 'k'),
        (r'pʰ', 'p'),
        (r'tʰ', 't'),
        (r'ow', 'o'),
        ('dʒ', 'j'),
    )

    text = input_string
    for pattern, replacement in sequence_rewrites:
        text = re.sub(pattern, replacement, text)

    # Open points left by the original author (not implemented here):
    #   ee / ii -> i or yi (binaee, jan-haee, tasheeli, ashnaii)
    #   eh -> e (boodjeh-ha)
    #   w -> v, but not always
    return text.translate(str.maketrans(char_map))

In [None]:
def get_finglish_consonants(word):
  """Spell the consonant skeleton of a Persian word in Finglish.

  Each Persian letter is replaced by its Finglish consonant; 'ا' and 'آ' are
  dropped. Characters that are not in the table are copied through unchanged.
  """
  consonant_for = {
      'ا': '', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',
      'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',
      'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "'",
      'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
      'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': "'",'ئ': "'", 'ؤ': "'",
      'آ': '', 'أ': "'", 'إ': "'", 'ۀ': 'y'
  }
  pieces = []
  for letter in word:
    pieces.append(consonant_for.get(letter, letter))
  return ''.join(pieces)

In [None]:
def get_updated_span(match_span, displacements):
  """Shift a (start, end) span by the edits already applied to the text.

  *displacements* is a sequence of (position, delta) pairs, processed in order.
  A pair is applied when its position is at or before the span's current
  (already shifted) start; applying it moves both ends of the span by delta.
  Returns the shifted (start, end) tuple.
  """
  span_start, span_end = match_span[0], match_span[1]
  shift = 0
  for position, delta in displacements:
    if position <= span_start + shift:
      shift += delta

  return (span_start + shift, span_end + shift)

In [None]:
# Character translation table handed to replace_LLM_phonetic_characters (which
# feeds it to str.maketrans) to rewrite IPA symbols, accented/uppercase Latin
# letters, Cyrillic look-alikes and Persian/Arabic letters into the
# single-letter phoneme alphabet of the dictionary (A, S, Z, C, x, q, '?', ...).
# Mapping a character to '' deletes it. Every key has to be a single character
# for str.maketrans to accept the dict.
# Several keys are listed more than once; the LAST occurrence is the effective
# one. In particular 'ś' is first mapped to 's' and later to 'S', so 'S' wins.
output_to_phonetics_map = {
    'м': 'm',
    # Fixed: this entry used to be ' v' (leading space), which injected a space
    # into the phoneme string; the sibling tables map 'ʷ' to 'v'.
    'ʷ': 'v',
    'w': 'v',
    # 'ɔ': '?',
    'c': 'k',
    'ĉ': 'C',
    'č': 'C',
    '̕': "?",
    "'": '?',
    'ʔ': "?",
    'ꞌ': "?",
    '̛':  "?",
    '’': "?",
    'ʼ': "?",
    "'": '?',
    'â': 'A',
    'â': 'A',
    'ȃ': 'A',
    'ž': 'Z',
    'š': 'S',
    'W': 'v',
    'β': 'f',
    'е': 'e',
    '`': "?",
    'ɑ': 'A',
    'ɑ': 'A',
    'ʃ': 'S',
    'ð': 'z',
    'ɾ': 'r',
    'æ': 'a',
    'ɪ': 'e',
    'χ': 'x',
    'ɣ': 'q',
    'ʒ': 'Z',
    ':': '',
    'ː': '',
    'ā': 'A',
    'ː': '',
    'ä': 'A',
    'á': 'A',
    'š': 'S',
    'ū': 'u',
    'û': 'u',
    'ś': 's',
    'ī': 'i',
    'í': 'i',
    'î': 'i',
    'é': 'e',
    'ḥ': 'h',
    'ɒ': 'A',
    'ʰ': '',
    'ə': 'e',
    'R': 'r',
    'W': 'v',
    'Q': 'q',
    'T': 't',
    'Y': 'y',
    'P': 'p',
    'D': 'd',
    'F': 'f',
    'H': 'h',
    'J': 'j',
    'L': 'l',
    'X': 'x',
    'V': 'v',
    'B': 'b',
    'N': 'n',
    'M': 'm',
    'K': 'k',
    'G': 'g',
    'U': 'u',
    'O': 'o',
    'I': 'i',
    'E': 'e',
    'ا': 'A',
    'ب': 'b',
    'پ': 'p',
    'ت': 't',
    'ث': 's',
    'ج': 'j',
    'چ': 'C',
    'ح': 'h',
    'خ': 'x',
    'د': 'd',
    'ذ': 'z',
    'ر': 'r',
    'ز': 'z',
    'ژ': 'Z',
    'س': 's',
    'ش': 'S',
    'ص': 's',
    'ض': 'z',
    'ط': 't',
    'ظ': 'z',
    'ع': "?",
    'غ': 'q',
    'ف': 'f',
    'ق': 'q',
    'ک': 'k',
    'گ': 'g',
    'ل': 'l',
    'م': 'm',
    'ن': 'n',
    'و': 'v',
    'ه': 'h',
    'ی': 'y',
    'ء': "?",
    'ئ': "?",
    'ؤ': "o?",
    'آ': 'A',
    'أ': "a?",
    'إ': "e?",
    'ۀ': 'eye',
    'ŋ': 'ng',
    '.': '',
    'ɛ': 'e',
    'ʊ': 'u',
    "ˈ": '?',
    'ù': 'u',
    'θ': 's',
    '̪': '',
    'ũ': 'u',
    '_': '',
    'ç': 'C',
    'ĝ': 'q',
    'ɢ': 'q',
    'ː': '',
    'í': 'i',
    'ŝ': 'S',
    '!': '',
    'ǧ': 'q',
    'ʻ': '?',
    'è': 'e',
    '�': '',
    'ú': 'u',
    'ô': 'o',
    'ē': 'e',
    'à': 'A',
    'ă': 'A',
    'ǐ': 'i',
    'ü': 'u',
    '\u200e': '',
    'ğ': 'q',
    'ṣ': 'S',
    'â': 'A',
    'â': 'A',
    'ȃ': 'A',
    'ž': 'Z',
    'š': 'S',
    'ā': 'A',
    'ː': '',
    'ä': 'A',
    'á': 'A',
    'š': 'S',
    'ū': 'u',
    'û': 'u',
    'ś': 'S',
    'ī': 'i',
    'í': 'i',
    'î': 'i',
    'é': 'e',
}

# Zero-width lookaheads: each matches (without consuming anything) a position
# that is immediately followed by one of the listed Latin letters.
consonants_regex = '(?=' + '|'.join('qrtypsdfghjklzxcvbnm' + 'QRTYPSDFGHJKLZXCVBNM') + ')'
vowels_regex = '(?=' + '|'.join('aAeiuo') + ')'


def replace_LLM_phonetic_characters(input_string, char_map, from_phonetics=False):
    """Normalise raw LLM output into the single-letter phoneme alphabet.

    Steps, in this exact order (later steps rely on earlier ones):

    1. Unless *from_phonetics* is true, capitalised Finglish digraphs
       (Sh, Ch, Zh, Gh, Kh) become their phoneme letter (S, C, Z, q, x).
    2. 'ch', IPA affricates, 'ow' and 'dʒ' are rewritten.
    3. Characters that are keys of *char_map* are replaced (``str.translate``).
    4. Doubled vowels collapse: ee/ii -> i, oo/uu -> u, aa/AA/Aa/aA -> A.
    5. Lower-case sh/kh/zh/gh become S/x/Z/q only at a word start, before a
       consonant, or at a word end (elsewhere the 'h' may be a separate sound).
    6. A '?' is inserted before a vowel that starts the string or follows a
       character other than a word character, '-' or '?'.
    7. Leftovers of an ``output=[...]`` wrapper are removed, then everything
       except ASCII letters, '?' and whitespace is dropped.

    Args:
        input_string: Text produced by the model.
        char_map: Single-character translation table for step 3.
        from_phonetics: Skip step 1 when the text is already in phoneme letters.

    Returns:
        The normalised phoneme string.

    Raises:
        TypeError: If *input_string* is not a string (the offending value is
            printed first to ease debugging).
    """
    if not from_phonetics:
      try:
        for digraph, phoneme in (('Sh', 'S'), ('Ch', 'C'), ('Zh', 'Z'), ('Gh', 'q'), ('Kh', 'x')):
          input_string = re.sub(digraph, phoneme, input_string)
      except TypeError:
        # Previously a bare `except:` that also swallowed KeyboardInterrupt;
        # the very next substitution raised the same TypeError anyway.
        print(input_string)
        raise

    substituted = input_string
    for pattern, phoneme in (
        (r'ch', 'C'),
        (r'tʃʰ', 'C'),
        (r'tʃ', 'C'),
        (r't͡S', 'C'),
        (r'ow', 'o'),
        ('dʒ', 'j'),
    ):
      substituted = re.sub(pattern, phoneme, substituted)

    # Single-character replacements from the caller's table.
    substituted = substituted.translate(str.maketrans(char_map))

    for pattern, phoneme in (
        ('ee', 'i'), ('ii', 'i'),
        ('oo', 'u'), ('uu', 'u'),
        ('aa', 'A'), ('AA', 'A'), ('Aa', 'A'), ('aA', 'A'),
    ):
      substituted = re.sub(pattern, phoneme, substituted)

    # Lower-case digraphs are only merged where they cannot be "consonant + h".
    for digraph, phoneme in (('sh', 'S'), ('kh', 'x'), ('zh', 'Z'), ('gh', 'q')):
      substituted = re.sub(rf'(?:\b({digraph})|({digraph}){consonants_regex}|({digraph})\b)', phoneme, substituted)

    # Glottal stop before word-initial vowels.
    substituted = re.sub(rf'([^\w\-\?]|^){vowels_regex}', r'\1?', substituted)

    # Open points left by the original author (not implemented here):
    #   ee / ii -> i or yi (binaee, jan-haee, tasheeli, ashnaii)
    #   eh -> e (boodjeh-ha); w -> v, but not always

    # Remove what is left of an "output=[...]" wrapper; longest forms first.
    for residue in ('?output=[', '[?output=', 'output=[', '[output=', 'output=', 'output', '[', ']', '='):
      substituted = substituted.replace(residue, '')

    return re.sub(r'[^a-zA-Z\?\s]', '', substituted)

In [None]:
def _resolve_digraph(model_text, gt_text, plain_letters, plain_chars,
                     digraph_letters, resolved_chars, resolved_needs_no_h,
                     digraph_heads, phoneme):
  """Resolve one '<consonant>h' ambiguity by walking *gt_text* and *model_text* together.

  For every ground-truth letter in *plain_letters* the cursor advances to the
  next character of *model_text* that is in *plain_chars* (consuming it). For
  every letter in *digraph_letters* the cursor advances until it finds either
  an already resolved symbol (a character in *resolved_chars*, which must not
  be followed by 'h' when *resolved_needs_no_h* is set) or a character of
  *digraph_heads* followed by 'h'; the latter pair is replaced by *phoneme*.
  """
  # The cursor is the index of the last consumed character. Starting at -1
  # makes index 0 eligible; starting at 0 skipped the first character, so
  # word-initial digraphs such as 'sh' in 'shab' were never resolved.
  i = -1
  for c in gt_text:
    if c in plain_letters:
      while i < len(model_text) - 1:
        i += 1
        if model_text[i] in plain_chars: break

    if c in digraph_letters:
      # Stop one short of the end: the checks below look at model_text[i + 1].
      while i < len(model_text) - 2:
        i += 1
        following = model_text[i + 1]
        if model_text[i] in resolved_chars and not (resolved_needs_no_h and following == 'h'): break
        if model_text[i] in digraph_heads and following == 'h':
          model_text = model_text[:i] + phoneme + model_text[i + 2:]
          break

    # Nothing left to scan in the model text.
    if i >= len(model_text) - 1: break

  return model_text


def fix_ambiguities(model_text, gt_text):
  """Turn ambiguous 'sh', 'zh', 'kh', 'gh' in *model_text* into S, Z, x, q.

  A Latin pair such as 'sh' can be one sound (ش) or two ('s' followed by 'h').
  The Persian spelling *gt_text* decides: a pair is merged only when the
  letter sequence of *gt_text* calls for the single sound at that point.
  The four ambiguities are resolved in independent passes (sh, zh, kh, gh).

  Args:
      model_text: Latin-script word produced by the model.
      gt_text: The corresponding Persian word.

  Returns:
      *model_text* with the resolvable digraphs replaced.
  """
  model_text = _resolve_digraph(
      model_text, gt_text,
      plain_letters='سصث', plain_chars='sS',
      digraph_letters='ش', resolved_chars='S', resolved_needs_no_h=True,
      digraph_heads='Ss', phoneme='S')

  model_text = _resolve_digraph(
      model_text, gt_text,
      plain_letters='زذضظ', plain_chars='zZ',
      digraph_letters='ژ', resolved_chars='Z', resolved_needs_no_h=True,
      digraph_heads='zZ', phoneme='Z')

  model_text = _resolve_digraph(
      model_text, gt_text,
      plain_letters='ک', plain_chars='kK',
      digraph_letters='خ', resolved_chars='xX', resolved_needs_no_h=False,
      digraph_heads='kK', phoneme='x')

  model_text = _resolve_digraph(
      model_text, gt_text,
      plain_letters='گ', plain_chars='Gg',
      digraph_letters='غق', resolved_chars='q', resolved_needs_no_h=False,
      digraph_heads='Gg', phoneme='q')

  return model_text

In [None]:
def substitute_by_dict(model_text, gt_text):
  """Replace words of *model_text* with dictionary phonemes where possible.

  Pass 1: every word of *model_text* whose Finglish form is a known dictionary
  spelling of a grapheme occurring in *gt_text* is replaced by the dictionary
  pronunciation of that grapheme closest to the word; the grapheme is then
  removed from *gt_text* so it is not matched again.

  Pass 2: remaining words containing sh/kh/zh (any case of the first letter)
  are compared, by consonant skeleton, with the remaining ground-truth words;
  the best match above 0.65 is used to disambiguate the digraphs.

  Args:
      model_text: Phoneme string produced by the model.
      gt_text: The original Persian sentence.

  Returns:
      *model_text* with the collected replacements applied.
  """
  word_pattern = r"(\?|\w|')+(?=[^\?\w']|$)"

  subwords = []
  matched_spans = set()
  for match in re.finditer(word_pattern, model_text):
      match_text = match.group()
      match_span = match.span()

      finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
      if finglish_text in inverted_finglish_merged_dict and inverted_finglish_merged_dict[finglish_text] in gt_text:
        grapheme = inverted_finglish_merged_dict[finglish_text]
        # Loop-invariant: computed once instead of once per candidate.
        phonetic_text = replace_LLM_phonetic_characters(finglish_text, output_to_phonetics_map)
        max_sim, max_sim_p = -1, ''
        for p in merged_dict[grapheme]:
          sim = SequenceMatcher(None, phonetic_text, p).ratio()
          if sim > max_sim:
            max_sim = sim
            max_sim_p = p

        gt_text = gt_text.replace(grapheme, '')
        subwords.append((match_span, max_sim_p))
        matched_spans.add(match_span)

  # NOTE(review): 'gh'/'Gh' is not in this list although fix_ambiguities also
  # resolves it - confirm whether that omission is intended.
  ambiguous_digraphs = ('sh', 'kh', 'zh', 'Sh', 'Kh', 'Zh')
  for match in re.finditer(word_pattern, model_text):
      match_text = match.group()
      match_span = match.span()

      if match_span in matched_spans: continue
      if not any(digraph in match_text for digraph in ambiguous_digraphs): continue

      finglish_text = replace_LLM_characters(match_text, output_to_finglish_map)
      consonant_finglish = re.sub(r'(е|e|i|u|o|a|ā|ä|â|ā|ɒ|á|A)', '', finglish_text)

      # Keep only the best ground-truth word above the threshold. Queuing one
      # replacement per qualifying word rewrote the same span several times,
      # and the later rewrites hit a shifted span and corrupted the text.
      best_ratio, best_gt_word = 0.65, None
      for gt_match in re.finditer(word_pattern, gt_text):
        gt_match_text = gt_match.group()
        gt_consonant_finglish = get_finglish_consonants(gt_match_text)
        ratio = SequenceMatcher(None, consonant_finglish, gt_consonant_finglish).ratio()
        if ratio > best_ratio:
          best_ratio, best_gt_word = ratio, gt_match_text

      if best_gt_word is not None:
        subwords.append((match_span, fix_ambiguities(match_text, best_gt_word)))

  # Apply the replacements, tracking how earlier edits moved later spans.
  displacements = []
  for span, replacement in subwords:
    updates_span = get_updated_span(span, displacements)
    model_text = model_text[:updates_span[0]] + replacement + model_text[updates_span[1]:]
    displacements.append((updates_span[0], len(replacement) - (updates_span[1] - updates_span[0])))

  return model_text

In [None]:
def get_known_words(graphemes, multiple_choices=True, dictionary=finglish_merged_dict):
  """Format the dictionary entries of the words in *graphemes* for a prompt.

  The text is split on runs of non-word characters. Each word found in
  *dictionary* yields one line ``word: pron1, pron2``. With
  *multiple_choices* false, words with more than one pronunciation are left
  out. Returns the lines joined by newlines ('' when nothing is known).
  """
  # Raw string: '\W' in a normal literal is an invalid escape sequence.
  words = re.split(r'\W+', graphemes)
  known = (
      w for w in words
      if w in dictionary and (multiple_choices or len(dictionary[w]) <= 1)
  )
  return '\n'.join(f'{w}: {", ".join(dictionary[w])}' for w in known)

In [None]:
def correct_output_by_llm_and_dict_info_finglish(grapheme, output, multi=True):
  """Ask an LLM to correct a Finglish transcription using dictionary hints.

  The request is repeated until the response contains a non-empty bracketed
  span; there is no retry limit. The first bracketed span is taken as the
  corrected text and normalised with replace_LLM_characters.

  Args:
      grapheme: The original Persian sentence.
      output: The Finglish produced by the model.
      multi: Passed to get_known_words as ``multiple_choices``.

  Returns:
      (corrected_finglish, input_words, output_words), where the two counters
      are the number of word tokens sent and received over all attempts.
  """
  matches = None

  input_words = 0
  output_words = 0

  while not matches:
    messages = [
            {
                "role": "system",
                "content": "A model was used to convert Persian sentences into Finglish (Persian written in the Latin alphabet). We have a dictionary with Finglish of some of the words. You are an assistant that corrects the Finglish output of the model choosing the right information from that dictionary. Be careful not to remove the connective Ezafe phonemes '-e' and '-ye' and show ع, ئ, and ٔ with '."
            },
            {
                "role": "user",
                "content": f'''Here is the original Persian sentence: [{grapheme}].
                Here is the Fingish output of the model: [{output}].
                Here is the Finglish to some words I found from dictionary:
                {get_known_words(grapheme, multiple_choices=multi, dictionary=finglish_merged_dict)}.
                Please return the corrected Finglish of the Persian sentence in brackets like output=[].'''
            }]

    response = get_response(messages, 'qwen/qwen-2-7b-instruct:free', 'openrouter')

    # Count word tokens. The previous re.split('\w*', ...) split on empty
    # matches as well, so its length was not a word count (and the non-raw
    # pattern is an invalid escape sequence).
    input_words += len(re.findall(r'\w+', messages[0]['content'])) + len(re.findall(r'\w+', messages[1]['content']))
    output_words += len(re.findall(r'\w+', response))

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_characters(output, output_to_finglish_map)
      return output, input_words, output_words

In [None]:
def correct_output_by_llm_and_dict_info_phonetic(grapheme, output, multi=True):
  """Ask an LLM to correct a phoneme transcription using dictionary hints.

  The request is repeated until the response contains a non-empty bracketed
  span; there is no retry limit. The first bracketed span is normalised with
  replace_LLM_phonetic_characters and then refined by substitute_by_dict.

  Args:
      grapheme: The original Persian sentence.
      output: The phoneme string produced by the model.
      multi: Passed to get_known_words as ``multiple_choices``.

  Returns:
      (corrected_phonemes, input_words, output_words), where the two counters
      are the number of word tokens sent and received over all attempts.
  """
  matches = None

  input_words = 0
  output_words = 0

  while not matches:
    messages = [
            {
                "role": "system",
                "content": "A model was used to convert Persian sentences into phonemes. We have a dictionary with phoneme of some of the words. You are an assistant that corrects the phoneme output of the model choosing the right information from that dictionary. Be careful not to remove the connective Ezafe phonemes '-e' and '-ye' and show ع, ئ, and ٔ with ?."
            },
            {
                "role": "user",
                "content": f'''Here is the original Persian sentence: [{grapheme}].
                Here is the phoneme output of the model: [{output}].
                Here is the phoneme to some words I found from dictionary:
                {get_known_words(grapheme, multiple_choices=multi, dictionary=merged_dict)}.
                Please return the corrected phoneme of the Persian sentence in brackets like output=[].'''
            }]

    response = get_response(messages, 'qwen/qwen-2-7b-instruct:free', 'openrouter')

    # Count word tokens. The previous re.split('\w*', ...) split on empty
    # matches as well, so its length was not a word count (and the non-raw
    # pattern is an invalid escape sequence).
    input_words += len(re.findall(r'\w+', messages[0]['content'])) + len(re.findall(r'\w+', messages[1]['content']))
    output_words += len(re.findall(r'\w+', response))

    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = substitute_by_dict(replace_LLM_phonetic_characters(output, output_to_phonetics_map), grapheme)
      return output, input_words, output_words

In [None]:
def replace_words_with_dict(text, dictionary=finglish_merged_dict):
    """Swap every unambiguous dictionary word in *text* for its pronunciation.

    A word (``\\b\\w+\\b``) is replaced only when *dictionary* lists exactly
    one pronunciation for it; all other text is returned untouched.
    """
    def _pronounce(match):
        token = match.group()
        if token in dictionary and len(dictionary[token]) == 1:
            return f'{dictionary[token][0]}'
        return token

    return re.sub(r'\b\w+\b', _pronounce, text)

In [None]:
def get_known_words_list(graphemes, multiple_choices=True, dictionary=finglish_merged_dict):
  """List the dictionary entries of the words in *graphemes*.

  The text is split on runs of non-word characters. Returns a list of
  ``(word, pronunciations)`` pairs, in text order, for the words found in
  *dictionary*. With *multiple_choices* false, words with more than one
  pronunciation are left out.
  """
  # Raw string: '\W' in a normal literal is an invalid escape sequence.
  words = re.split(r'\W+', graphemes)
  return [
      (w, dictionary[w])
      for w in words
      if w in dictionary and (multiple_choices or len(dictionary[w]) <= 1)
  ]

In [None]:
def substitute_output_by_dict(grapheme, output, dictionary=finglish_merged_dict):
  """Replace words of *output* with their closest dictionary pronunciation.

  Every dictionary pronunciation of every known word of *grapheme* is scored
  against every word of *output* (difflib ratio). Pairs are taken best-first;
  each grapheme is used at most once and pairs scoring below 0.65 are ignored.
  An Ezafe suffix on the output word ('-e', '-ye', or a fused 'e'/'ye' that
  improves the match when stripped) is re-attached as '-e'/'-ye'.

  Note: a word is only rewritten where it has a non-word character on both
  sides in *output*, so a word at the very start or end is left as is.

  Args:
      grapheme: The original Persian sentence.
      output: The transcription to correct.
      dictionary: grapheme -> list of pronunciations.

  Returns:
      The corrected transcription.
  """
  ACCEPTED_THRESHOLD = 0.65
  # Fuse a detached '-i' onto a preceding consonant.
  output = re.sub(r'([^еeiuoaāäâāɒáA])(-i)', r'\1i', output)

  alternatives = get_known_words_list(grapheme, dictionary=dictionary)
  # Raw string: '\w' and '\?' in a normal literal are invalid escape sequences.
  output_words = re.split(r'[^-\w\?]+', output)
  pairs = []

  graphemes = []
  for known_grapheme, phonemes in alternatives:
    graphemes.append(known_grapheme)
    for phoneme in phonemes:
      for candidate in output_words:
        pairs.append((SequenceMatcher(None, phoneme, candidate).ratio(), phoneme, candidate, known_grapheme))

  sorted_pairs = sorted(pairs, key=lambda x: x[0], reverse=True)

  for score, phoneme, output_word, known_grapheme in sorted_pairs:
    if score < ACCEPTED_THRESHOLD: break
    # Each occurrence of a grapheme is consumed by its best-scoring pair.
    if known_grapheme not in graphemes: continue
    graphemes.remove(known_grapheme)

    if output_word.endswith('-e'):
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

    elif output_word.endswith('-ye'):
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

    # The two fused-suffix checks compare against the CURRENT output word.
    # They used to read the stale loop variable `word`, i.e. always the last
    # word of the output.
    elif phoneme[-1] in 'еeiuoaāäâāɒáA' and output_word.endswith('ye') and SequenceMatcher(None, phoneme, output_word[:-2]).ratio() > score:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-ye'}\3", output)

    elif phoneme[-1] not in 'еeiuoaāäâāɒáA' and output_word.endswith('e') and SequenceMatcher(None, phoneme, output_word[:-1]).ratio() > score:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme + '-e'}\3", output)

    elif score > ACCEPTED_THRESHOLD:
      output = re.sub(rf'(\W)({re.escape(output_word)})(\W)', rf"\1{phoneme}\3", output)

  return output

In [None]:
from hazm import WordTokenizer, Normalizer

In [None]:
# Shared hazm helpers used by get_EZ_tags: the text is normalized first and
# then split into tokens for the POS tagger.
tokenizer = WordTokenizer()
normalizer = Normalizer()

In [None]:
def get_last_subword(word):
    """Split a token into (stem, plural/indefinite suffix).

    The token is cut into runs of word characters. If the last run is 'های'
    or 'ی' and another run precedes it, returns (run before it, that suffix);
    otherwise returns (last run, ''). A token without any word character is
    returned unchanged as (word, '') instead of raising IndexError.
    """
    # Split the word by word boundaries
    subwords = re.findall(r'\b\w+\b', word)

    if not subwords:
      return (word, '')

    if len(subwords) > 1 and subwords[-1] in ['های', 'ی']:
      return (subwords[-2], subwords[-1])

    return (subwords[-1], '')

In [None]:
def get_EZ_tags(grapheme, compound=False):
  """Return the tokens of *grapheme* whose POS tag contains 'EZ'.

  'ۀ' is first rewritten to 'ه‌ی'; the text is then normalized, tokenized and
  tagged. With *compound* true the result is a list of (token, tag, '').
  Otherwise each token is reduced with get_last_subword and the result is a
  list of (stem, tag, suffix).
  """
  prepared = normalizer.normalize(re.sub('ۀ', 'ه‌ی', grapheme))
  tagged = spacy_posTagger.tag(tokens=tokenizer.tokenize(prepared), universal_tag=False)
  ez_tags = [(item[0], item[1], '') for item in tagged if 'EZ' in item[1]]

  if compound:
    return ez_tags

  stems_and_suffixes = [get_last_subword(token) for token, _, _ in ez_tags]
  return [
      (stem, tag, suffix)
      for (_, tag, _), (stem, suffix) in zip(ez_tags, stems_and_suffixes)
  ]

In [None]:
def get_naive_finglish(word, phonetic=False):
  """Transliterate a Persian word letter by letter, without any vowel guessing.

  With *phonetic* true the single-letter phoneme alphabet is used (A, S, x,
  '?', ...); otherwise the Finglish spelling (aa, Sh, Kh, "'", ...). Characters
  that are not in the chosen table are copied through unchanged.
  """
  phoneme_letters = {
    'ا': 'A', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'C',
    'ح': 'h', 'خ': 'x', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Z',
    'س': 's', 'ش': 'S', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': '?',
    'غ': 'q', 'ف': 'f', 'ق': 'q', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': '?','ئ': '?', 'ؤ': '?',
    'آ': 'A', 'أ': '?', 'إ': '?', 'ۀ': 'eye'
  }
  finglish_letters = {
    'ا': 'aa', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',
    'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',
    'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "'",
    'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': "'",'ئ': "'", 'ؤ': "o'",
    'آ': 'aa', 'أ': "a'", 'إ': "e'", 'ۀ': 'eye'
  }
  table = phoneme_letters if phonetic else finglish_letters

  transliterated = []
  for letter in word:
    transliterated.append(table.get(letter, letter))
  return ''.join(transliterated)

In [None]:
def get_word_finglish_candidates(word, dictionary=finglish_merged_dict, phonetic=False):
  """Enumerate the possible transliterations of a (possibly compound) token.

  The token is cut into runs of word characters. A run found in *dictionary*
  contributes all its pronunciations; any other run contributes the single
  letter-by-letter transliteration from get_naive_finglish. The result is the
  concatenation of every combination, one choice per run, in product order.
  """
  subwords = [found.group() for found in re.finditer(r'\b(\w+)\b', word)]

  options_per_subword = [
      dictionary[subword] if subword in dictionary
      else [get_naive_finglish(subword, phonetic)]
      for subword in subwords
  ]

  # Generate all possible combinations and concatenate
  return [''.join(choice) for choice in itertools.product(*options_per_subword)]

In [None]:
def correct_finglish_model_EZ_by_tags(grapheme, model_output):
  """Add or remove Ezafe markers ('-e' / '-ye') in a Finglish transcription.

  The words of *grapheme* tagged with 'EZ' (see get_EZ_tags) are matched to
  words of *model_output* by string similarity (> 0.75) against their Finglish
  candidates. Matched output words get an Ezafe marker if they do not carry
  one; unmatched output words lose a marker they carry. Edits are applied in
  place, and `displacements` records each (position, length change) so that
  spans computed on the original string can be shifted with get_updated_span.

  Args:
      grapheme: The original Persian sentence.
      model_output: The Finglish transcription to correct.

  Returns:
      The corrected transcription.
  """
  EZ_tags = get_EZ_tags(grapheme, compound=False)

  # (span, output word, matched Finglish candidate, Persian suffix) per match.
  matches = []
  matched_spans = set()

  for word, tag, ending in EZ_tags:
    finglish_candidates = get_word_finglish_candidates(word, finglish_merged_dict)
    # print(finglish_candidates)
    # if word == 'برای': continue
    for finglish in finglish_candidates:
      # Skip candidates ending in 'aaye' unless the Persian word ends in 'ه'.
      if finglish.endswith('aaye') and not word.endswith('ه'): continue
      for match in re.finditer(r'\b(\w+)\b', model_output):
        match_text = match.group()
        match_span = match.span()
        # Bare suffix tokens are never matched; each output span is used once.
        if match_text not in ['e', 'ye', 'i', 'ha', 'haa', 'haaye', 'haye'] and match_span not in matched_spans and SequenceMatcher(None, match_text, finglish).ratio() > 0.75:
          matches.append((match_span, match_text, finglish, ending))
          matched_spans.add(match_span)

  # Output words that were not matched to any EZ-tagged word.
  # NOTE(review): unlike the list above, this exclusion list has no 'haa' -
  # confirm whether that difference is intended.
  non_matches = []
  for match in re.finditer(r'\b(\w+)\b', model_output):
    match_text = match.group()
    match_span = match.span()
    if match_text not in ['e', 'ye', 'i', 'ha', 'haaye', 'haye'] and match_span not in matched_spans:
      non_matches.append((match_text, match_span))

  # print(matches)

  displacements = []
  for match_span, _, finglish, ending in matches:
    match_span = get_updated_span(match_span, displacements)
    # Already followed by an Ezafe marker (or a plural carrying one): keep.
    if model_output[match_span[1]:].startswith('-e') or model_output[match_span[1]:].startswith('-ye') or model_output[match_span[1]:].startswith('-haaye') or model_output[match_span[1]:].startswith('-haye') or model_output[match_span[1]:].startswith('-haa-ye') or model_output[match_span[1]:].startswith('-ha-ye'):
      continue

    output_word = model_output[match_span[0]:match_span[1]]
    # Fused '<vowel>ye': split it off as '-ye' when the stem is a dictionary
    # word (and the fused form is not) or the stem matches the candidate better.
    if len(output_word) >= 4 and output_word[-3] in 'еeiuoaāäâāɒá' and output_word.endswith('ye') and \
     ((word_in_dict(output_word[:-2]) and not word_in_dict(output_word)) or \
     SequenceMatcher(None, output_word[:-2], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # Fused final 'e': same test, split off as '-e'.
    if len(output_word) >= 3 and output_word.endswith('e') and \
     ((word_in_dict(output_word[:-1]) and not word_in_dict(output_word)) or \
     SequenceMatcher(None, output_word[:-1], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # Persian suffix 'ی': a trailing 'ye' right after the candidate's last
    # letter is separated with a hyphen.
    if ending == 'ی' and len(output_word) >= 4 and output_word[-2:] == 'ye' and output_word[-3] == finglish[-1]:
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # Persian suffix 'های' written as fused 'haaye': insert a hyphen before it.
    if ending == 'های' and len(output_word) >= 7 and output_word[-5:] == 'haaye':
      model_output = model_output[:match_span[1] - 5] + '-' + model_output[match_span[1] - 5:]
      displacements.append((match_span[1] - 5, 1))
      continue

    # Persian suffix 'های' written as fused 'haye': replace by '-haaye'
    # (4 characters become 6, hence the displacement of 2).
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'haye':
      model_output = model_output[:match_span[1] - 4] + '-haaye' + model_output[match_span[1]:]
      displacements.append((match_span[1] - 4, 2))
      continue

    # No Persian suffix: a final 'e' right after the candidate's last letter
    # is separated with a hyphen.
    if ending not in ['ی', 'های'] and len(output_word) >= 3 and output_word[-2] == finglish[-1] and output_word[-1] == 'e':
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # print('output word', output_word)
    # print('finglish', finglish)
    # Word ends in a vowel and carries no marker: append '-ye'.
    if output_word[-1] in 'еeiuoaāäâāɒá' and not output_word.endswith('haye') and not output_word.endswith('haaye'):
      model_output = model_output[:match_span[1]] + '-ye' + model_output[match_span[1]:]
      displacements.append((match_span[1], 3))
      continue

    # Otherwise, unless the word already ends in 'e', append '-e'.
    if not output_word.endswith('e'):
      model_output = model_output[:match_span[1]] + '-e' + model_output[match_span[1]:]
      displacements.append((match_span[1], 2))

  # print(model_output)
  # print(non_matches)

  # Words not tagged EZ must not carry an Ezafe marker: strip it.
  for non_match, match_span in non_matches:
    match_span = get_updated_span(match_span, displacements)
    output_word = model_output[match_span[0]:match_span[1]]
    # Detached '-e' right after the word.
    if re.match(r'^-e\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 2:]
      displacements.append((match_span[1] + 2, -2))
      continue

    # Detached '-ye' right after the word.
    if re.match(r'^-ye\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 3:]
      displacements.append((match_span[1] + 3, -3))
      continue

    # Fused '<vowel>ye' whose stem (but not the fused form) is a dictionary word.
    if len(output_word) >= 4 and output_word[-3] in 'еeiuoaāäâāɒá' and output_word.endswith('ye') and  (word_in_dict(output_word[:-2]) and not word_in_dict(output_word)):
      model_output = model_output[:match_span[1] - 2] + model_output[match_span[1]:]
      displacements.append((match_span[1], -2))
      continue

    # Fused final 'e' whose stem (but not the fused form) is a dictionary word.
    if len(output_word) >= 3 and output_word.endswith('e') and (word_in_dict(output_word[:-1]) and not word_in_dict(output_word)):
      model_output = model_output[:match_span[1] - 1] + model_output[match_span[1]:]
      displacements.append((match_span[1], -1))

  return model_output

In [None]:
def correct_phonetic_model_EZ_by_tags(grapheme, model_output):
  """Reconcile Ezafe markers in a phonetic transcription with the POS tagger.

  Words of `grapheme` that `get_EZ_tags` reports as Ezafe-bearing are fuzzily
  matched (difflib ratio > 0.75) against the tokens of `model_output`.  A
  matched token gets its Ezafe written as a hyphenated suffix ('-e', '-ye' or
  '-hAye'), either by splitting an existing ending or by appending one.  An
  unmatched token has a detected Ezafe ending removed.

  Args:
    grapheme: Persian source sentence.
    model_output: Phonetic transcription of `grapheme` to correct.

  Returns:
    The corrected transcription string.
  """
  vowels = 'еeiuoaāäâāɒáA'
  ezafe_suffixes = ('-e', '-ye', '-hAye', '-haye', '-hA-ye', '-ha-ye')

  EZ_tags = get_EZ_tags(grapheme, compound=False)

  # model_output is only edited after matching, so tokenize it a single time.
  tokens = [(m.group(), m.span()) for m in re.finditer(r'\b(\w+)\b', model_output)]

  matches = []
  matched_spans = set()

  for word, _, ending in EZ_tags:
    finglish_candidates = get_word_finglish_candidates(word, merged_dict, phonetic=True)
    for finglish in finglish_candidates:
      if finglish.endswith('Aye') and not word.endswith('ه'): continue
      for match_text, match_span in tokens:
        if match_text not in ['e', 'ye', 'i', 'ha', 'hA', 'hAye', 'haye'] and match_span not in matched_spans and SequenceMatcher(None, match_text, finglish).ratio() > 0.75:
          matches.append((match_span, match_text, finglish, ending))
          matched_spans.add(match_span)

  non_matches = []
  for match_text, match_span in tokens:
    if match_text not in ['e', 'ye', 'i', 'ha', 'hAye', 'haye'] and match_span not in matched_spans:
      non_matches.append((match_text, match_span))

  # Each edit is logged as (position, length delta) so that spans computed on
  # the original string can be re-based by get_updated_span.
  displacements = []
  for match_span, _, finglish, ending in matches:
    match_span = get_updated_span(match_span, displacements)
    # The Ezafe is already written as a separate suffix: nothing to do.
    if model_output.startswith(ezafe_suffixes, match_span[1]):
      continue

    output_word = model_output[match_span[0]:match_span[1]]
    # '...Vye' whose stem is the better dictionary / candidate fit: split off '-ye'.
    if len(output_word) >= 4 and output_word[-3] in vowels and output_word.endswith('ye') and \
     ((word_in_dict(output_word[:-2], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)) or \
     SequenceMatcher(None, output_word[:-2], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # '...e' whose stem is the better dictionary / candidate fit: split off '-e'.
    if len(output_word) >= 3 and output_word.endswith('e') and \
     ((word_in_dict(output_word[:-1], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)) or \
     SequenceMatcher(None, output_word[:-1], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    if ending == 'ی' and len(output_word) >= 4 and output_word[-2:] == 'ye' and output_word[-3] == finglish[-1]:
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # 'hAye' is four characters long: compare a four-character slice and log
    # the displacement at the same offset the hyphen is inserted at.
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'hAye':
      model_output = model_output[:match_span[1] - 4] + '-' + model_output[match_span[1] - 4:]
      displacements.append((match_span[1] - 4, 1))
      continue

    # Lower-case 'haye' is rewritten to the '-hAye' spelling (one character longer).
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'haye':
      model_output = model_output[:match_span[1] - 4] + '-hAye' + model_output[match_span[1]:]
      displacements.append((match_span[1] - 4, 1))
      continue

    if ending not in ['ی', 'های'] and len(output_word) >= 3 and output_word[-2] == finglish[-1] and output_word[-1] == 'e':
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # No Ezafe present yet: append '-ye' after a vowel, '-e' otherwise.
    if output_word[-1] in vowels and not output_word.endswith('haye') and not output_word.endswith('hAye'):
      model_output = model_output[:match_span[1]] + '-ye' + model_output[match_span[1]:]
      displacements.append((match_span[1], 3))
      continue

    if not output_word.endswith('e'):
      model_output = model_output[:match_span[1]] + '-e' + model_output[match_span[1]:]
      displacements.append((match_span[1], 2))

  # Tokens that matched no Ezafe-tagged word lose any Ezafe the model emitted.
  for _, match_span in non_matches:
    match_span = get_updated_span(match_span, displacements)
    output_word = model_output[match_span[0]:match_span[1]]
    if re.match(r'^-e\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 2:]
      displacements.append((match_span[1] + 2, -2))
      continue

    if re.match(r'^-ye\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 3:]
      displacements.append((match_span[1] + 3, -3))
      continue

    if len(output_word) >= 4 and output_word[-3] in vowels and output_word.endswith('ye') and  (word_in_dict(output_word[:-2], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)):
      model_output = model_output[:match_span[1] - 2] + model_output[match_span[1]:]
      displacements.append((match_span[1], -2))
      continue

    if len(output_word) >= 3 and output_word.endswith('e') and (word_in_dict(output_word[:-1], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)):
      model_output = model_output[:match_span[1] - 1] + model_output[match_span[1]:]
      displacements.append((match_span[1], -1))

  return model_output

# Ezafe post-processing

In [None]:
from hazm import WordTokenizer, Normalizer

In [None]:
# Shared hazm instances; get_EZ_tags normalizes a sentence with `normalizer`
# and then splits it into tokens with `tokenizer`.
tokenizer = WordTokenizer()
normalizer = Normalizer()

In [None]:
def get_last_subword(word):
    """Return the last stem of a token and its trailing 'های' / 'ی' suffix.

    The token is split into runs of word characters.  When the final run is
    'های' or 'ی' and another run precedes it, the pair
    (preceding run, suffix) is returned; otherwise (final run, '').

    A token with no word characters at all (e.g. '' or pure punctuation) is
    returned unchanged as (word, '') instead of raising IndexError.
    """
    subwords = re.findall(r'\b\w+\b', word)

    # Nothing to split: hand the token back rather than indexing an empty list.
    if not subwords:
        return (word, '')

    if len(subwords) > 1 and subwords[-1] in ['های', 'ی']:
        return (subwords[-2], subwords[-1])

    return (subwords[-1], '')

In [None]:
def get_EZ_tags(grapheme, compound=False):
  """Collect the tokens of `grapheme` whose POS tag contains 'EZ'.

  'ۀ' is first expanded to 'ه‌ی', then the sentence is normalized, tokenized
  and tagged with `spacy_posTagger`.

  Returns a list of 3-tuples:
    * compound=True:  (token, tag, '')
    * compound=False: (stem, tag, suffix), with stem and suffix taken from
      get_last_subword(token).
  """
  expanded = grapheme.replace('ۀ', 'ه‌ی')
  words = tokenizer.tokenize(normalizer.normalize(expanded))

  ez_entries = []
  for entry in spacy_posTagger.tag(tokens=words, universal_tag=False):
    if 'EZ' in entry[1]:
      ez_entries.append((entry[0], entry[1], ''))

  if compound:
    return ez_entries

  def split_entry(entry):
    # Separate a trailing plural / ye suffix from the token it is attached to.
    stem, suffix = get_last_subword(entry[0])
    return (stem, entry[1], suffix)

  return [split_entry(entry) for entry in ez_entries]

In [None]:
def get_naive_finglish(word, phonetic=False):
  """Transliterate `word` one character at a time with a fixed lookup table.

  With phonetic=True the single-symbol phonetic table is used, otherwise the
  Finglish table.  Characters that have no entry in the chosen table are
  copied through unchanged.
  """
  if phonetic:
    table = {
    'ا': 'A', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'C',
    'ح': 'h', 'خ': 'x', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Z',
    'س': 's', 'ش': 'S', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': '?',
    'غ': 'q', 'ف': 'f', 'ق': 'q', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': '?','ئ': '?', 'ؤ': '?',
    'آ': 'A', 'أ': '?', 'إ': '?', 'ۀ': 'eye'
    }
  else:
    table = {
      'ا': 'aa', 'ب': 'b', 'پ': 'p', 'ت': 't', 'ث': 's', 'ج': 'j', 'چ': 'Ch',
      'ح': 'h', 'خ': 'Kh', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'ژ': 'Zh',
      'س': 's', 'ش': 'Sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "'",
      'غ': 'Gh', 'ف': 'f', 'ق': 'Gh', 'ک': 'k', 'گ': 'g', 'ل': 'l', 'م': 'm',
      'ن': 'n', 'و': 'v', 'ه': 'h', 'ی': 'y', 'ء': "'",'ئ': "'", 'ؤ': "o'",
      'آ': 'aa', 'أ': "a'", 'إ': "e'", 'ۀ': 'eye'
    }

  pieces = []
  for letter in word:
    # Unknown characters (Latin letters, digits, punctuation, ...) pass through.
    pieces.append(table[letter] if letter in table else letter)
  return ''.join(pieces)

In [None]:
def get_word_finglish_candidates(word, dictionary=finglish_merged_dict, phonetic=False):
  """List every transliteration candidate for a (possibly multi-part) word.

  `word` is split into runs of word characters.  Each run contributes the
  entry stored for it in `dictionary` (presumably a list of candidate
  strings -- confirm against how the dictionary is built) or, when it has no
  entry, the single naive transliteration from get_naive_finglish.  The
  result is the concatenation of every combination of per-run candidates.
  """
  options_per_piece = [
    dictionary[piece] if piece in dictionary else [get_naive_finglish(piece, phonetic)]
    for piece in re.findall(r'\b(\w+)\b', word)
  ]

  # Cartesian product over the pieces, each combination glued into one string.
  return [''.join(combo) for combo in itertools.product(*options_per_piece)]

In [None]:
def correct_finglish_model_EZ_by_tags(grapheme, model_output):
  """Reconcile Ezafe markers in a Finglish transcription with the POS tagger.

  Words of `grapheme` that get_EZ_tags reports as Ezafe-bearing are fuzzily
  matched (difflib ratio > 0.75) against the tokens of `model_output`.  A
  matched token gets its Ezafe written as a hyphenated suffix ('-e', '-ye' or
  '-haaye'), either by splitting an existing ending or by appending one.  An
  unmatched token has a detected Ezafe ending removed.

  Args:
    grapheme: Persian source sentence.
    model_output: Finglish transcription of `grapheme` to correct.

  Returns:
    The corrected transcription string.
  """
  EZ_tags = get_EZ_tags(grapheme, compound=False)

  matches = []
  matched_spans = set()

  # Phase 1: pair every Ezafe-tagged word with the output tokens that look
  # like one of its candidate spellings.  A span is claimed at most once.
  for word, tag, ending in EZ_tags:
    finglish_candidates = get_word_finglish_candidates(word, finglish_merged_dict)
    # print(finglish_candidates)
    # if word == 'برای': continue
    for finglish in finglish_candidates:
      # Candidates ending in 'aaye' are only considered for words ending in 'ه'.
      if finglish.endswith('aaye') and not word.endswith('ه'): continue
      for match in re.finditer(r'\b(\w+)\b', model_output):
        match_text = match.group()
        match_span = match.span()
        # Stand-alone suffix tokens are never treated as words of their own.
        if match_text not in ['e', 'ye', 'i', 'ha', 'haa', 'haaye', 'haye'] and match_span not in matched_spans and SequenceMatcher(None, match_text, finglish).ratio() > 0.75:
          matches.append((match_span, match_text, finglish, ending))
          matched_spans.add(match_span)

  # Phase 2: every remaining token is a non-match.
  # NOTE(review): 'haa' is excluded above but not here -- confirm this is intended.
  non_matches = []
  for match in re.finditer(r'\b(\w+)\b', model_output):
    match_text = match.group()
    match_span = match.span()
    if match_text not in ['e', 'ye', 'i', 'ha', 'haaye', 'haye'] and match_span not in matched_spans:
      non_matches.append((match_text, match_span))

  # print(matches)

  # Phase 3: add / normalize the Ezafe on matched tokens.  Each edit is logged
  # as (position, length delta); get_updated_span presumably uses that log to
  # re-base spans that were computed on the unedited string.
  displacements = []
  for match_span, _, finglish, ending in matches:
    match_span = get_updated_span(match_span, displacements)
    # The Ezafe is already written as a separate suffix: nothing to do.
    if model_output[match_span[1]:].startswith('-e') or model_output[match_span[1]:].startswith('-ye') or model_output[match_span[1]:].startswith('-haaye') or model_output[match_span[1]:].startswith('-haye') or model_output[match_span[1]:].startswith('-haa-ye') or model_output[match_span[1]:].startswith('-ha-ye'):
      continue

    output_word = model_output[match_span[0]:match_span[1]]
    # '...Vye' whose stem is the better dictionary / candidate fit: split off '-ye'.
    if len(output_word) >= 4 and output_word[-3] in 'еeiuoaāäâāɒá' and output_word.endswith('ye') and \
     ((word_in_dict(output_word[:-2]) and not word_in_dict(output_word)) or \
     SequenceMatcher(None, output_word[:-2], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # '...e' whose stem is the better dictionary / candidate fit: split off '-e'.
    if len(output_word) >= 3 and output_word.endswith('e') and \
     ((word_in_dict(output_word[:-1]) and not word_in_dict(output_word)) or \
     SequenceMatcher(None, output_word[:-1], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # Word written with a 'ی' suffix: the trailing 'ye' is the Ezafe when the
    # character before it equals the candidate's last character.
    if ending == 'ی' and len(output_word) >= 4 and output_word[-2:] == 'ye' and output_word[-3] == finglish[-1]:
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # Plural 'های': put a hyphen in front of a fused 'haaye'.
    if ending == 'های' and len(output_word) >= 7 and output_word[-5:] == 'haaye':
      model_output = model_output[:match_span[1] - 5] + '-' + model_output[match_span[1] - 5:]
      displacements.append((match_span[1] - 5, 1))
      continue

    # Short spelling 'haye' is rewritten to '-haaye' (two characters longer).
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'haye':
      model_output = model_output[:match_span[1] - 4] + '-haaye' + model_output[match_span[1]:]
      displacements.append((match_span[1] - 4, 2))
      continue

    # No written suffix: a final 'e' following the candidate's last character is the Ezafe.
    if ending not in ['ی', 'های'] and len(output_word) >= 3 and output_word[-2] == finglish[-1] and output_word[-1] == 'e':
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # print('output word', output_word)
    # print('finglish', finglish)
    # No Ezafe present yet: append '-ye' after a vowel, '-e' otherwise.
    if output_word[-1] in 'еeiuoaāäâāɒá' and not output_word.endswith('haye') and not output_word.endswith('haaye'):
      model_output = model_output[:match_span[1]] + '-ye' + model_output[match_span[1]:]
      displacements.append((match_span[1], 3))
      continue

    if not output_word.endswith('e'):
      model_output = model_output[:match_span[1]] + '-e' + model_output[match_span[1]:]
      displacements.append((match_span[1], 2))

  # print(model_output)
  # print(non_matches)

  # Phase 4: tokens that matched no Ezafe-tagged word lose any Ezafe the model emitted.
  for non_match, match_span in non_matches:
    match_span = get_updated_span(match_span, displacements)
    output_word = model_output[match_span[0]:match_span[1]]
    # Hyphenated '-e' directly after the token: drop it.
    if re.match(r'^-e\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 2:]
      displacements.append((match_span[1] + 2, -2))
      continue

    # Hyphenated '-ye' directly after the token: drop it.
    if re.match(r'^-ye\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 3:]
      displacements.append((match_span[1] + 3, -3))
      continue

    # Fused 'ye' / 'e' is stripped only when the stem is a dictionary word and
    # the full token is not.
    if len(output_word) >= 4 and output_word[-3] in 'еeiuoaāäâāɒá' and output_word.endswith('ye') and  (word_in_dict(output_word[:-2]) and not word_in_dict(output_word)):
      model_output = model_output[:match_span[1] - 2] + model_output[match_span[1]:]
      displacements.append((match_span[1], -2))
      continue

    if len(output_word) >= 3 and output_word.endswith('e') and (word_in_dict(output_word[:-1]) and not word_in_dict(output_word)):
      model_output = model_output[:match_span[1] - 1] + model_output[match_span[1]:]
      displacements.append((match_span[1], -1))

  return model_output

In [None]:
def correct_phonetic_model_EZ_by_tags(grapheme, model_output):
  """Reconcile Ezafe markers in a phonetic transcription with the POS tagger.

  Words of `grapheme` that `get_EZ_tags` reports as Ezafe-bearing are fuzzily
  matched (difflib ratio > 0.75) against the tokens of `model_output`.  A
  matched token gets its Ezafe written as a hyphenated suffix ('-e', '-ye' or
  '-hAye'), either by splitting an existing ending or by appending one.  An
  unmatched token has a detected Ezafe ending removed.

  Args:
    grapheme: Persian source sentence.
    model_output: Phonetic transcription of `grapheme` to correct.

  Returns:
    The corrected transcription string.
  """
  vowels = 'еeiuoaāäâāɒáA'
  ezafe_suffixes = ('-e', '-ye', '-hAye', '-haye', '-hA-ye', '-ha-ye')

  EZ_tags = get_EZ_tags(grapheme, compound=False)

  # model_output is only edited after matching, so tokenize it a single time.
  tokens = [(m.group(), m.span()) for m in re.finditer(r'\b(\w+)\b', model_output)]

  matches = []
  matched_spans = set()

  for word, _, ending in EZ_tags:
    finglish_candidates = get_word_finglish_candidates(word, merged_dict, True)
    for finglish in finglish_candidates:
      if finglish.endswith('Aye') and not word.endswith('ه'): continue
      for match_text, match_span in tokens:
        if match_text not in ['e', 'ye', 'i', 'ha', 'hA', 'hAye', 'haye'] and match_span not in matched_spans and SequenceMatcher(None, match_text, finglish).ratio() > 0.75:
          matches.append((match_span, match_text, finglish, ending))
          matched_spans.add(match_span)

  non_matches = []
  for match_text, match_span in tokens:
    if match_text not in ['e', 'ye', 'i', 'ha', 'hAye', 'haye'] and match_span not in matched_spans:
      non_matches.append((match_text, match_span))

  # Each edit is logged as (position, length delta) so that spans computed on
  # the original string can be re-based by get_updated_span.
  displacements = []
  for match_span, _, finglish, ending in matches:
    match_span = get_updated_span(match_span, displacements)
    # The Ezafe is already written as a separate suffix: nothing to do.
    if model_output.startswith(ezafe_suffixes, match_span[1]):
      continue

    output_word = model_output[match_span[0]:match_span[1]]
    # '...Vye' whose stem is the better dictionary / candidate fit: split off '-ye'.
    if len(output_word) >= 4 and output_word[-3] in vowels and output_word.endswith('ye') and \
     ((word_in_dict(output_word[:-2], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)) or \
     SequenceMatcher(None, output_word[:-2], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # '...e' whose stem is the better dictionary / candidate fit: split off '-e'.
    if len(output_word) >= 3 and output_word.endswith('e') and \
     ((word_in_dict(output_word[:-1], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)) or \
     SequenceMatcher(None, output_word[:-1], finglish).ratio() > SequenceMatcher(None, output_word, finglish).ratio()):
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    if ending == 'ی' and len(output_word) >= 4 and output_word[-2:] == 'ye' and output_word[-3] == finglish[-1]:
      model_output = model_output[:match_span[1] - 2] + '-' + model_output[match_span[1] - 2:]
      displacements.append((match_span[1] - 2, 1))
      continue

    # 'hAye' is four characters long: compare a four-character slice and log
    # the displacement at the same offset the hyphen is inserted at.
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'hAye':
      model_output = model_output[:match_span[1] - 4] + '-' + model_output[match_span[1] - 4:]
      displacements.append((match_span[1] - 4, 1))
      continue

    # Lower-case 'haye' is rewritten to the '-hAye' spelling (one character longer).
    if ending == 'های' and len(output_word) >= 6 and output_word[-4:] == 'haye':
      model_output = model_output[:match_span[1] - 4] + '-hAye' + model_output[match_span[1]:]
      displacements.append((match_span[1] - 4, 1))
      continue

    if ending not in ['ی', 'های'] and len(output_word) >= 3 and output_word[-2] == finglish[-1] and output_word[-1] == 'e':
      model_output = model_output[:match_span[1] - 1] + '-' + model_output[match_span[1] - 1:]
      displacements.append((match_span[1] - 1, 1))
      continue

    # No Ezafe present yet: append '-ye' after a vowel, '-e' otherwise.
    if output_word[-1] in vowels and not output_word.endswith('haye') and not output_word.endswith('hAye'):
      model_output = model_output[:match_span[1]] + '-ye' + model_output[match_span[1]:]
      displacements.append((match_span[1], 3))
      continue

    if not output_word.endswith('e'):
      model_output = model_output[:match_span[1]] + '-e' + model_output[match_span[1]:]
      displacements.append((match_span[1], 2))

  # Tokens that matched no Ezafe-tagged word lose any Ezafe the model emitted.
  for _, match_span in non_matches:
    match_span = get_updated_span(match_span, displacements)
    output_word = model_output[match_span[0]:match_span[1]]
    if re.match(r'^-e\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 2:]
      displacements.append((match_span[1] + 2, -2))
      continue

    if re.match(r'^-ye\b', model_output[match_span[1]:]):
      model_output = model_output[:match_span[1]] + model_output[match_span[1] + 3:]
      displacements.append((match_span[1] + 3, -3))
      continue

    if len(output_word) >= 4 and output_word[-3] in vowels and output_word.endswith('ye') and  (word_in_dict(output_word[:-2], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)):
      model_output = model_output[:match_span[1] - 2] + model_output[match_span[1]:]
      displacements.append((match_span[1], -2))
      continue

    if len(output_word) >= 3 and output_word.endswith('e') and (word_in_dict(output_word[:-1], inverted_merged_dict) and not word_in_dict(output_word, inverted_merged_dict)):
      model_output = model_output[:match_span[1] - 1] + model_output[match_span[1]:]
      displacements.append((match_span[1], -1))

  return model_output

# Prompt 12: Finglish + In-Context Learning + hint single

In [None]:
def prompt12(word, phoneme, grapheme):
  """Ask the LLM for the Finglish form of a sentence, with dictionary hints.

  The prompt combines in-context examples, the dictionary entries returned by
  get_known_words(grapheme, multiple_choices=False) and a hint that `word` is
  probably pronounced `phoneme` in this sentence.

  Args:
    word: The Persian word the pronunciation hint refers to.
    phoneme: The suggested pronunciation of `word`.
    grapheme: The Persian sentence to transliterate.

  Returns:
    The first bracketed span of the model response, with the brackets
    stripped, passed through replace_LLM_phonetic_characters and
    substitute_by_dict.

  The request is repeated, without limit, until the response contains a
  bracketed span.
  """
  # The prompt does not depend on the attempt number, so it is built once and
  # reused for every retry.
  messages = [
          {
              "role": "system",
              "content": '''You are an assistant that converts Persian sentences into their Finglish representation.

                ## Transliteration Guidelines
                1. Accurately represent the pronunciation of Persian words.
                2. Use hyphens to connect words with Ezafe when needed (e.g., "رنگ آبی": "rang-e aabi", "زندگی شیرین": "zendegi-ye Shirin").
                3. Use "Sh" for 'ش', "Ch" for 'چ', "Kh" for 'خ', "Gh" for 'ق' and 'غ', "Zh" for 'ژ'.

                Additional guidelines:
                - Short vowels: a (ـَ), e (ـِ), o (ـُ)
                - Long vowels: aa (آ/ا), i (ای), oo (او)
                - Use 'z' for ز, ذ, ض, and ظ; 's' for س and ص; 't' for ت and ط; 'y' for ی; 'j' for ج; and 'k' for ک
                - Omit silent 'h' at the end of words (e.g., خانه → Khaane, not Khaaneh)
                - Represent ع, ئ , and ء with an apostrophe ' when it's pronounced

                In the inputs you will be given, the Finglish of some of the words is given as help.

                ## Examples
                input=[جشن چهل مرد]. dict entries: 'چهل': Chehel, 'جشن': jaShn, output=[jaShn-e Chehel mard].
                input=[گل نو در غار هست یا خانه؟]. dict entries: 'غار': Ghaar, 'خانه': Khaane. output=[ɡol-e no dar Ghaar hast ya Khaane]
                input=[ژن زیبارویان پولدار]. dict entries: 'ژن': Zhen, 'زیبا': zibaa. output=[Zhen-e zibaarooyaan-e pooldaar]
                input=[اتفاقی نمی‌افتد]. dict entries: . output=[ettefaaGhi nemi-oftad]
                input=[گرگ حیوانی وحشی است].dict entries: 'گرگ': gorg, 'وحشی': vahShi. output=[gorg heyvaani vahShi ast].
                '''
          },
          {
              "role": "user",
              "content": f'''This is the Persian sentence: [{grapheme}].
                These are the pronunciatin of some of the words I know:
                {get_known_words(grapheme, multiple_choices=False)}.
                And the pronunciation of "{word}" in this sentence is probably "{phoneme}".
                Return Finglish of the Persian sentenc in brackets like output=[].'''
          }]

  while True:
    response = get_response(messages, 'gpt-4o', 'avalai')

    # The answer is expected inside square brackets, e.g. output=[...].
    matches = re.findall(r'\[[^\]]+\]', response)

    if matches:
      output = matches[0].strip('[]')
      output = replace_LLM_phonetic_characters(output, output_to_phonetics_map)
      output = substitute_by_dict(output, grapheme)
      return output

    print("Matching failed")

# Inference

In [None]:
import csv
from google.colab import auth
import gspread
from google.auth import default

# Authenticate and authorize:
# sign the Colab user in, load the default Google credentials, and build the
# gspread client `gc` that the download step uses to open sheets by URL.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
# Input and output file paths
# input_csv_path: index CSV that is read for its "Sheet Name", "Link" and "Checked" columns.
# output_csv_path: CSV that results are appended to ("Word", "Sentence", "Output" columns).
# temp_dir: folder holding one downloaded CSV per checked sheet.
input_csv_path = "Human Labeling Sheet Links - Sheet1.csv"  # Replace with your input CSV file path
output_csv_path = "/content/drive/openai_phonemized_sentences.csv"  # Replace with your desired output CSV file path
temp_dir = "temp_sheets/"  # Temporary directory to save downloaded sheets


In [None]:
# Ensure the temporary directory exists
os.makedirs(temp_dir, exist_ok=True)

# Step 1: Download "Checked" sheets as CSV files.
# CSV files are opened as UTF-8 with newline="" (as the csv module documents)
# so that Persian text and line breaks inside quoted cells round-trip intact.
with open(input_csv_path, "r", newline="", encoding="utf-8") as input_csv:
    reader = csv.DictReader(input_csv)

    for idx, row in enumerate(reader):
        # NOTE(review): DictReader already consumes the header line, so this
        # also skips the first data row -- confirm that this is intended.
        if idx == 0: continue

        sheet_name = row["Sheet Name"]
        link = row["Link"]
        # A short row yields None for its missing columns; treat it as unchecked.
        checked = (row["Checked"] or "").strip().lower() == "true"

        if not link or not checked:
            continue  # Skip rows with an empty link or not marked as Checked

        # Open the Google Sheet. Errors raised by gspread are not caught here,
        # so a single failing sheet stops the whole download.
        sheet = gc.open_by_url(link)
        worksheet = sheet.get_worksheet(0)

        # Get all rows from the worksheet
        data = worksheet.get_all_values()

        # Save the sheet as a temporary CSV file
        temp_csv_path = os.path.join(temp_dir, f"{sheet_name}.csv")
        with open(temp_csv_path, "w", newline="", encoding="utf-8") as temp_csv:
            temp_writer = csv.writer(temp_csv)
            temp_writer.writerows(data)

print(f"All Checked sheets downloaded to {temp_dir}.")


All Checked sheets downloaded to temp_sheets/.


In [None]:
# Step 2: Process downloaded CSV files
# Load existing sentences from the output file so that a re-run resumes
# instead of prompting again for sentences that already have a result.
existing_sentences = set()
try:
    with open(output_csv_path, "r", newline="", encoding="utf-8") as output_csv:
        reader = csv.DictReader(output_csv)
        for row in reader:
            existing_sentences.add(row["Sentence"])
except FileNotFoundError:
    pass  # Output file does not exist yet

# Prepare the output CSV file
with open(output_csv_path, "a", newline="", encoding="utf-8") as output_csv:
    fieldnames = ["Word", "Sentence", "Output"]
    writer = csv.DictWriter(output_csv, fieldnames=fieldnames)

    # Write the header only if the file is empty
    if output_csv.tell() == 0:
        writer.writeheader()

    # Process each downloaded CSV file
    for temp_csv_file in os.listdir(temp_dir):
        temp_csv_path = os.path.join(temp_dir, temp_csv_file)

        with open(temp_csv_path, "r", newline="", encoding="utf-8") as temp_csv:
            data = list(csv.reader(temp_csv))

        # Most recent first-column value containing non-ASCII characters;
        # it is carried forward to the following rows of the same file.
        last_non_english_word = ""

        # NOTE(review): every row is processed, including the first one; if
        # the sheets start with a header row its cells are treated as data.
        for row_data in data:
            # Blank lines and single-column rows carry no sentence.
            if len(row_data) < 2:
                continue

            word = row_data[0]  # First column
            sentence = row_data[1]  # Second column

            # Check if the word is non-English (basic check using regex for non-ASCII characters)
            if re.search(r"[^\x00-\x7F]", word):
                last_non_english_word = word

            if sentence and sentence not in existing_sentences:  # Check if the second column is not empty and not already processed
                output = prompt12(last_non_english_word, word, sentence)
                print(last_non_english_word, word, output)
                writer.writerow({"Word": last_non_english_word, "Sentence": sentence, "Output": output})
                # Push the row to disk right away so an interrupted run keeps
                # the results that were already obtained.
                output_csv.flush()
                existing_sentences.add(sentence)  # Add to the set of processed sentences

print(f"Processing complete. Results saved to {output_csv_path}.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
prompted!
وسطی vostA vostA kolA sarsabz ?ast
prompted!
وسطی vostA vostA kola Sarji ?ast
prompted!
قوت qovvat man qazA mixoram qovvat begiram
prompted!
قوت qovvat qovvat dar bAzuye mardAn ?ast
prompted!
قوت qovvat mardAn qovvate qalbe hamsarAn hastand
prompted!
قوت qovvat mAdaram qovvate qalbe man ?ast
prompted!
قوت qovvat hamsaram qovvate qalbe farzandam ?ast
prompted!
قوت qut man quti rA be ?eSqAli ?andAxtam
prompted!
قوت qut qutiye man surAx ?ast
prompted!
قوت qut ?in quti be dard nemixorad
prompted!
قوت qut man qutiye bozorgi mikhAm
prompted!
کارد kArd dustam bA kArd ?Adam koSt
prompted!
کارد kArd dastam rA bA kArd beridam
prompted!
کارد kArd kArd tiz ?ast
prompted!
کارد kArd ?esteak bA kArd va CangAl serv miSavad
prompted!
کارد kArd bA kArd dastam rA beridam
prompted!
کارد kArad ?u bAyad deraxt bekArad
prompted!
کارد kArad dar ruze deraxtkAri hasan bAyad deraxt bekArad
prompted!
کارد kArad ?u bAyad gol rA bekArad
prom