Add files via upload

main
Mahta Fetrat committed dca5537078 · 2 weeks ago
3 changed files with 2726 additions and 0 deletions
  1. testing-scripts/test.ipynb  +1948 −0
  2. training-scripts/finetune-ge2pe.py  +381 −0
  3. training-scripts/finetune-t5.py  +397 −0

testing-scripts/test.ipynb  +1948 −0
File diff suppressed because it is too large.


training-scripts/finetune-ge2pe.py  +381 −0

@@ -0,0 +1,381 @@
# %%
import os
from dataclasses import dataclass
from typing import Dict, List, Union

import numpy as np
import pandas as pd
import torch
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)


os.environ["WANDB_DISABLED"] = "true"

# %%
set_seed(41)

# %%
def prepare_dataset(batch):
    # Keep the raw grapheme string as the model input and the mapped phoneme
    # string as the target; tokenization happens later in the data collator.
    batch['input_ids'] = batch['Grapheme']
    batch['labels'] = batch['Mapped Phoneme']
    return batch

# %%
# Data collator that tokenizes and pads grapheme/phoneme strings on the fly
@dataclass
class DataCollatorWithPadding:
    tokenizer: AutoTokenizer
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        words = [feature["input_ids"] for feature in features]
        prons = [feature["labels"] for feature in features]
        batch = self.tokenizer(words, padding=self.padding, add_special_tokens=False, return_attention_mask=True, return_tensors='pt')
        pron_batch = self.tokenizer(prons, padding=self.padding, add_special_tokens=True, return_attention_mask=True, return_tensors='pt')
        # Mask padding positions in the labels with -100 so the loss ignores them
        batch['labels'] = pron_batch['input_ids'].masked_fill(pron_batch.attention_mask.ne(1), -100)
        return batch
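
# %%
# Illustrative only (added, not part of the original pipeline): how the collator
# is meant to be used. The grapheme/phoneme pair below is made up, and
# `tokenizer` is the ByT5 tokenizer loaded further down in this script.
# collator = DataCollatorWithPadding(tokenizer=tokenizer)
# toy_batch = collator([{"input_ids": "کتاب", "labels": "ketAb"}])
# toy_batch["labels"]  # pad positions are -100, so they do not contribute to the loss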

# %%
# Compute metrics (CER and WER)
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # Restore pad tokens in place of -100 so the labels can be decoded
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer, "wer": wer}

# Set up the evaluation metrics
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")
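
# %%
# Quick illustration (added, not in the original script) of what the metrics
# return on a made-up prediction/reference pair.
print(cer_metric.compute(predictions=["ketAb"], references=["ketab"]))  # 1 of 5 chars differs -> 0.2
print(wer_metric.compute(predictions=["ketAb xub"], references=["ketab xub"]))  # 1 of 2 words differs -> 0.5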

# %% [markdown]
# # Phase 1

# %%
def load_pronuncation_dictionary(path, train=True, homograph_only=False, human=False) -> Dataset:
    # Read the CSV file
    df = pd.read_csv(path, index_col=[0])

    # For homograph rows, optionally keep only human-annotated or only machine-sourced entries
    if homograph_only:
        if human:
            df = df[df['Source'] == 'human']
        else:
            df = df[df['Source'] != 'human']

    # Drop unnecessary columns
    df = df.drop(['Source', 'Source ID'], axis=1)

    # Drop rows where 'Mapped Phoneme' is NaN
    df = df.dropna(subset=['Mapped Phoneme'])

    # Keep only rows whose phoneme string fits the 512-token generation limit
    Plen = np.array([len(i) for i in df['Mapped Phoneme']])
    df = df.iloc[Plen < 512, :]

    # Filter rows based on the 'Homograph Grapheme' column
    if homograph_only:
        df = df[df['Homograph Grapheme'].notna() & (df['Homograph Grapheme'] != '')]
    else:
        df = df[df['Homograph Grapheme'].isna() | (df['Homograph Grapheme'] == '')]

    # Shuffle the DataFrame
    df = df.sample(frac=1)

    # Hold out the last 90 rows for evaluation
    if train:
        return Dataset.from_pandas(df.iloc[:len(df)-90, :])
    else:
        return Dataset.from_pandas(df.iloc[len(df)-90:, :])
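
# %%
# Illustrative usage (added; assumes PersianG2P_final.csv is present, as in the
# cells below): peek at one held-out grapheme/phoneme pair.
# peek = load_pronuncation_dictionary('PersianG2P_final.csv', train=False)
# print(peek[0]['Grapheme'], '->', peek[0]['Mapped Phoneme'])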

# %%
# Load datasets (rows without a 'Homograph Grapheme' value; homograph rows come in Phase 2)
train_data = load_pronuncation_dictionary('PersianG2P_final.csv', train=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final.csv', train=False)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from checkpoint
checkpoint_path = "checkpoint-320" # Path to your checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase1-30-ep",     # Directory to save the fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=5,              # Fewer epochs for this phase
    learning_rate=5e-4,              # Default learning rate
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase1-30-ep")

# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase1-30-ep.png")

# Optionally, close the plot to free up memory
plt.close()

# %% [markdown]
# # Phase 2

# %%
# Load datasets (only rows with 'Homograph Grapheme')
train_data = load_pronuncation_dictionary('PersianG2P_final.csv',
                                          train=True,
                                          homograph_only=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final.csv',
                                        train=False,
                                        homograph_only=True)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase1-30-ep" # Path to the model from Step 1
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase2-30-ep",     # Directory to save the Phase 2 model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=30,             # More epochs for this phase
    learning_rate=5e-4,              # Same learning rate as Phase 1
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase2-30-ep")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase2-30-ep.png")

# Optionally, close the plot to free up memory
plt.close()

# %% [markdown]
# # Phase 3

# %%
# Load datasets (only rows with 'Homograph Grapheme')
train_data = load_pronuncation_dictionary('PersianG2P_final_augmented_final.csv',
                                          train=True,
                                          homograph_only=True,
                                          human=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final_augmented_final.csv',
                                        train=False,
                                        homograph_only=True,
                                        human=True)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase2-30-ep" # Path to the model from Step 1
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase3-30-ep",     # Directory to save the final fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=50,             # Most epochs in this phase
    learning_rate=5e-4,              # Same learning rate as earlier phases
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase3-30-ep")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase3-30-ep.png")

# Optionally, close the plot to free up memory
plt.close()



training-scripts/finetune-t5.py  +397 −0

@@ -0,0 +1,397 @@
# %%
import os
from dataclasses import dataclass
from typing import Dict, List, Union

import numpy as np
import pandas as pd
import torch
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    T5Config,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

os.environ["WANDB_DISABLED"] = "true"

# %%
set_seed(41)

# %%
def prepare_dataset(batch):
    batch['input_ids'] = batch['Grapheme']
    batch['labels'] = batch['Mapped Phoneme']
    return batch

# %%
# Data collator for padding
@dataclass
class DataCollatorWithPadding:
    tokenizer: AutoTokenizer
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        words = [feature["input_ids"] for feature in features]
        prons = [feature["labels"] for feature in features]
        batch = self.tokenizer(words, padding=self.padding, add_special_tokens=False, return_attention_mask=True, return_tensors='pt')
        pron_batch = self.tokenizer(prons, padding=self.padding, add_special_tokens=True, return_attention_mask=True, return_tensors='pt')
        batch['labels'] = pron_batch['input_ids'].masked_fill(pron_batch.attention_mask.ne(1), -100)
        return batch

# %%
# Compute metrics (CER and WER)
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer, "wer": wer}

# Set up the evaluation metrics
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

# %% [markdown]
# # Phase 1

# %%
def load_pronuncation_dictionary(path, train=True, homograph_only=False, human=False) -> Dataset:
    # Read the CSV file
    df = pd.read_csv(path, index_col=[0])

    # For homograph rows, optionally keep only human-annotated or only machine-sourced entries
    if homograph_only:
        if human:
            df = df[df['Source'] == 'human']
        else:
            df = df[df['Source'] != 'human']

    # Drop unnecessary columns
    df = df.drop(['Source', 'Source ID'], axis=1)

    # Drop rows where 'Mapped Phoneme' is NaN
    df = df.dropna(subset=['Mapped Phoneme'])

    # Keep only rows whose phoneme string fits the 512-token generation limit
    Plen = np.array([len(i) for i in df['Mapped Phoneme']])
    df = df.iloc[Plen < 512, :]

    # Filter rows based on the 'Homograph Grapheme' column
    if homograph_only:
        df = df[df['Homograph Grapheme'].notna() & (df['Homograph Grapheme'] != '')]
    else:
        df = df[df['Homograph Grapheme'].isna() | (df['Homograph Grapheme'] == '')]

    # Shuffle the DataFrame
    df = df.sample(frac=1)

    # Hold out the last 90 rows for evaluation
    if train:
        return Dataset.from_pandas(df.iloc[:len(df)-90, :])
    else:
        return Dataset.from_pandas(df.iloc[len(df)-90:, :])

# %%
# Load datasets (rows without a 'Homograph Grapheme' value; homograph rows come in Phase 2)
train_data = load_pronuncation_dictionary('PersianG2P_final.csv', train=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final.csv', train=False)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Train a compact ByT5 from scratch rather than from a checkpoint; only the
# pretrained tokenizer is reused.
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

# Shrink byt5-small to a compact 2-layer encoder/decoder configuration
config = T5Config.from_pretrained('google/byt5-small')

config.num_decoder_layers = 2
config.num_layers = 2
config.d_kv = 64
config.d_model = 512
config.d_ff = 512

print('Initializing a ByT5 model...')
model = T5ForConditionalGeneration(config)
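
# Rough size check (added for illustration, not in the original script): this
# shrunken config is far smaller than stock byt5-small (~300M parameters).
print(f'{model.num_parameters():,} parameters')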


# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase1-t5",        # Directory to save the fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=5,              # Fewer epochs for this phase
    learning_rate=5e-4,              # Default learning rate
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
    remove_unused_columns=False,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase1-t5")

# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase1-t5.png")

# Optionally, close the plot to free up memory
plt.close()

# %% [markdown]
# # Phase 2

# %%
# Load datasets (only rows with 'Homograph Grapheme')
train_data = load_pronuncation_dictionary('PersianG2P_final.csv',
                                          train=True,
                                          homograph_only=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final.csv',
                                        train=False,
                                        homograph_only=True)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase1-t5" # Path to the model from Step 1
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase2-t5",        # Directory to save the Phase 2 model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=30,             # More epochs for this phase
    learning_rate=5e-4,              # Same learning rate as Phase 1
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase2-t5")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase2-t5.png")

# Optionally, close the plot to free up memory
plt.close()

# %% [markdown]
# # Phase 3

# %%
# Load datasets (only rows with 'Homograph Grapheme')
train_data = load_pronuncation_dictionary('PersianG2P_final_augmented_final.csv',
                                          train=True,
                                          homograph_only=True,
                                          human=True)
train_data = train_data.map(prepare_dataset)
train_dataset = train_data

dev_data = load_pronuncation_dictionary('PersianG2P_final_augmented_final.csv',
                                        train=False,
                                        homograph_only=True,
                                        human=True)
dev_data = dev_data.map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase2-t5" # Path to the model from Step 1
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments (default values)
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase3-t5",        # Directory to save the final fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    evaluation_strategy="steps",
    per_device_train_batch_size=32,  # Default batch size
    per_device_eval_batch_size=100,  # Default batch size
    num_train_epochs=50,             # Most epochs in this phase
    learning_rate=5e-4,              # Same learning rate as earlier phases
    warmup_steps=1000,               # Default warmup steps
    logging_steps=1000,              # Default logging steps
    save_steps=4000,                 # Default save steps
    eval_steps=1000,                 # Default evaluation steps
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load the best model at the end of training
    fp16=False,                      # Disable FP16 by default
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./phase3-t5")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history
train_loss = []
val_loss = []
for log in trainer.state.log_history:
    if "loss" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", marker="o")
plt.plot(val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase3-t5.png")

# Optionally, close the plot to free up memory
plt.close()


