
Initial commit

master
a.h.hadian 3 weeks ago
commit 3c34d42558

+ 3
- 0
Classification/README.md

@@ -0,0 +1,3 @@
# Sentence Classification Task on the GLUE Benchmark

Details coming soon!

+ 52
- 0
Classification/config.py

@@ -0,0 +1,52 @@
import argparse
import torch
from media import media_path

class Config:
def __init__(self):
self.parser = argparse.ArgumentParser()
self.add_arguments()
self.args = self.parse()
self.post_process()
def parse(self):
return self.parser.parse_args()
def add_arguments(self):
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training')
self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available')
self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')

self.parser.add_argument('--batch_size', type=int, default=16, help='batch size for training ')
self.parser.add_argument('--virtual_batch_size', type=int, default=16, help='batch size for updating model parameters')
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training')
self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer')
self.parser.add_argument('--optimizer_eps', type=float, default=1e-8, help='optimizer eps')
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1")
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps')

self.parser.add_argument('--max_length', type=int, default=128, help='Max length for tokenization')
self.parser.add_argument('--peft_mode', type=str, default='lora', choices=['lora', 'bitfit', 'full', 'lorabitfit'], help='PEFT mode for fine-tuning')
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora')
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora')
self.parser.add_argument('--dataset', type=str, default='sst2', choices=['sst2', 'mnli', 'qqp', 'qnli'], help='Dataset name')
self.parser.add_argument('--toy_example', type=int, default=0, help='if 1, the first 1024 data from train dataset will be used for training')

self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1")
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget")
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget")
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning')
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm")

self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1")
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name")
self.parser.add_argument("--run_name", type=str, default=None, help="run name")

self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes lora then bitfit")

def post_process(self):
assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu")
self.args.media_path = media_path
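
As a quick aside (not part of the committed file), the sketch below shows how the parsed arguments are typically consumed: the ratio of --virtual_batch_size to --batch_size determines how many physical batches are accumulated per optimizer update. It assumes the working directory is Classification/ and that media.py (further down in this commit) provides media_path; the argument values are placeholders.

# Illustrative sketch only; not repository code.
import sys
from config import Config

sys.argv = ["main.py", "--peft_mode", "lora", "--batch_size", "8", "--virtual_batch_size", "32", "--dp", "1"]
cfg = Config().args
# Gradient accumulation: 32 // 8 = 4 physical batches per optimizer update.
print(cfg.virtual_batch_size // cfg.batch_size)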

+ 107
- 0
Classification/main.py

@@ -0,0 +1,107 @@
from config import Config
from src.model import prepare_model
from src.data import prepare_data
from src.train import Trainer
import os
import random
import numpy as np
import torch
import wandb
import logging
import transformers
import warnings

warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes ")

transformers.logging.set_verbosity_error()

def set_seeds(seed: int):
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
transformers.set_seed(seed)

def copy_model_weights(model1, model2):
model1.eval()
model2.eval()
params1 = model1.parameters()
params2 = model2.parameters()
with torch.no_grad():
for param1, param2 in zip(params1, params2):
param2.data.copy_(param1.data)

# Returns number of trainable parameters of the model
def get_number_of_trainable_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Returns number of parameters of the model
def get_number_of_parameters(model):
return sum(p.numel() for p in model.parameters())


def main(cfg):
set_seeds(cfg.seed)

model, tokenizer = prepare_model(cfg)
num_of_all_params = get_number_of_parameters(model)
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %")
train_loader, val_loader_one, val_loader_two = prepare_data(cfg, tokenizer)
logging.info("Data is ready")

trainer = Trainer(cfg, model, train_loader)
trainer.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two)
if cfg.two_step_training:
if cfg.dp:
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model)
model_two, _ = prepare_model(cfg)
copy_model_weights(model, model_two)
del model
model = model_two
for a, b in model.roberta.named_parameters():
if 'bias' in a:
b.requires_grad = True
else:
b.requires_grad = False
logging.info("New Model adjusted")
num_of_all_params = get_number_of_parameters(model)
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %")
trainer_two = Trainer(cfg, model, train_loader, checkpoint="temp.pth")
trainer_two.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two)

if cfg.use_wandb:
wandb.finish()

if __name__ == "__main__":
cfg = Config().args

log_path = "logs/"
if not os.path.exists(log_path):
os.makedirs(log_path)
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log"

if cfg.use_wandb:
wandb.login(key="YOUR_KEY")
if cfg.run_name:
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name)
else:
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}")
log_file_name = wandb.run.name

logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.info("Start of the logging")
hyperparameters = {key: value for key, value in vars(cfg).items()}
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()])
logging.info("config:\n" + hyperparameters_str)

main(cfg)

+ 1
- 0
Classification/media.py

@@ -0,0 +1 @@
media_path = "YOUR_MEDIA_PATH"

+ 45
- 0
Classification/src/data.py

@@ -0,0 +1,45 @@
from datasets import load_from_disk
from torch.utils.data import DataLoader
from torch.utils.data import WeightedRandomSampler


TASK_TO_KEYS = {
"mnli": ("premise", "hypothesis"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"sst2": ("sentence", None),
}

def prepare_data(cfg, tokenizer):
dataset = load_from_disk(f"{cfg.media_path}saved_datasets/{cfg.dataset}")
sentence1_key, sentence2_key = TASK_TO_KEYS[cfg.dataset]

if cfg.toy_example:
dataset["train"] = dataset["train"].select(range(1024))

def tokenize(batch):
args = ((batch[sentence1_key],) if sentence2_key is None else (batch[sentence1_key], batch[sentence2_key]))
return tokenizer(*args, padding="max_length", truncation=True, max_length=cfg.max_length)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

cfg.train_data_size = len(dataset['train'])

sampler = WeightedRandomSampler([cfg.virtual_batch_size/cfg.train_data_size for _ in range(cfg.train_data_size)], num_samples=cfg.train_data_size, replacement=True)
train_loader = DataLoader(dataset['train'], batch_size=cfg.virtual_batch_size, sampler=sampler, drop_last=True)

validation_loader_one = None
validation_loader_two = None
if cfg.dataset == "mnli":
if cfg.toy_example:
dataset["validation_matched"] = dataset["validation_matched"].select(range(100))
dataset["validation_mismatched"] = dataset["validation_mismatched"].select(range(100))
validation_loader_one = DataLoader(dataset['validation_matched'], batch_size=cfg.batch_size)
validation_loader_two = DataLoader(dataset['validation_mismatched'], batch_size=cfg.batch_size)
else:
if cfg.toy_example:
dataset["validation"] = dataset["validation"].select(range(100))
validation_loader_one = DataLoader(dataset['validation'], batch_size=cfg.batch_size)

return train_loader, validation_loader_one, validation_loader_two
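
For context, here is a small standalone example (not repository code) of how the tokenize closure above pairs the two text fields of a sentence-pair task; it pulls roberta-large from the Hugging Face hub rather than the local media path used by the repository.

# Hypothetical illustration only.
from transformers import RobertaTokenizer

tok = RobertaTokenizer.from_pretrained("roberta-large")
enc = tok("What does DP-SGD clip?", "It clips per-sample gradients.", padding="max_length", truncation=True, max_length=16)
print(len(enc["input_ids"]))  # 16: every example is padded or truncated to max_length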

+ 91
- 0
Classification/src/model.py

@@ -0,0 +1,91 @@
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import logging
import torch


def prepare_model(cfg):
tokenizer = RobertaTokenizer.from_pretrained(f"{cfg.media_path}models/roberta-large-tokenizer")
model = RobertaForSequenceClassification.from_pretrained(f"{cfg.media_path}models/roberta-large-model")
if cfg.dataset == 'mnli':
model.classifier.out_proj = torch.nn.Linear(model.classifier.out_proj.in_features, 3, bias=True)
# adjust model parameters
if cfg.peft_mode == "lora":
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha)
freeze_non_LoRA(model.roberta, peft_key='sharif_llm')
logging.info("LoRA model loaded")
elif cfg.peft_mode == "bitfit":
for a, b in model.roberta.named_parameters():
if not 'bias' in a:
b.requires_grad = False
logging.info("BiTFiT model loaded")
elif cfg.peft_mode == "lorabitfit":
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha)
freeze_non_LoRA(model.roberta, peft_key='sharif_llm')
if cfg.two_step_training == 0:
for a, b in model.roberta.named_parameters():
if 'bias' in a:
b.requires_grad = True
logging.info("LoRA and BiTFiT combined model loaded")
elif cfg.peft_mode == "full":
logging.info("Full model loaded")
else:
logging.info("No acceptable model to load")
model.to(cfg.device)
return model, tokenizer


class LoRALayer(torch.nn.Module):
def __init__(
self,
module: torch.nn.Linear,
rank: int ,
alpha: float
):
super().__init__()
self.rank = rank
self.alpha = alpha
self.scaling = self.alpha / self.rank # scaling factor
self.in_dim = module.in_features
self.out_dim = module.out_features
self.pretrained = module

self.sharif_llm_A = torch.nn.Linear(self.in_dim, self.rank, bias=False)
torch.nn.init.kaiming_normal_(self.sharif_llm_A.weight)
self.sharif_llm_B = torch.nn.Linear(self.rank, self.out_dim, bias=False)
torch.nn.init.zeros_(self.sharif_llm_B.weight)

def forward(self, x: torch.Tensor):
pretrained_out = self.pretrained(x)
lora_out = self.sharif_llm_A(x) # x@A
lora_out = self.sharif_llm_B(lora_out) # x@A@B
lora_out = self.scaling * lora_out # Scale by the scaling factor
return pretrained_out + lora_out # x@W + x@A@B*(scaling_factor)

def mutate_model(model: torch.nn.Module, rank: int, alpha: float):
"""
Replaces all linear layers in the model with LoRALinear layers.
Freeze all params except LoRA params.
"""
# make sure the model does not already contain LoRALayer modules; return if it does
for name, module in model.named_modules():
if isinstance(module, LoRALayer):
logging.info("Model already contains LoRALinear layers! \n Try reloading the model.")
return

# we want to replace all query and value Linear modules with LoRALayer
for name, module in model.named_children():
# if the module is linear and the name is for query or value
if isinstance(module, torch.nn.Linear) and (name == 'query' or name == 'value'):
# replace the module with LoRALayer
lora_layer = LoRALayer(module, rank, alpha)
setattr(model, name, lora_layer)
else:
mutate_model(module, rank, alpha) # recursively call the function on the module


def freeze_non_LoRA(model, peft_key):
for param_name, weights in model.named_parameters():
weights.requires_grad = peft_key in param_name
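
A small sanity check, offered only as an illustration: because sharif_llm_B is zero-initialized, a freshly wrapped LoRALayer reproduces the pretrained layer's output exactly, so fine-tuning starts from the original model. It assumes this file is importable as src.model from the Classification directory.

# Illustrative check; not repository code.
import torch
from src.model import LoRALayer

base = torch.nn.Linear(16, 16)
wrapped = LoRALayer(base, rank=8, alpha=16)
x = torch.randn(2, 16)
assert torch.allclose(wrapped(x), base(x))  # B == 0, so the LoRA branch adds nothing yet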

+ 147
- 0
Classification/src/train.py

@@ -0,0 +1,147 @@
from transformers import get_linear_schedule_with_warmup
import logging
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import wandb
import math
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager


class Trainer:
def __init__(self, cfg, model, train_loader, checkpoint=None):
self.criterion = CrossEntropyLoss()
self.val_criterion = CrossEntropyLoss()
self.optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, eps=cfg.optimizer_eps)
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size
total_steps = math.ceil(len(train_loader) / self.gradient_accumulation_steps) * cfg.epochs

if cfg.scheduler:
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=cfg.scheduler_warmup_ratio*total_steps, num_training_steps=total_steps)
self.dp = cfg.dp
self.model = model
self.cfg = cfg

if cfg.dp:
self.model.train()
self.privacy_engine = PrivacyEngine(
accountant="rdp",
)
if checkpoint:
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model)
self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon(
module=self.model,
optimizer=self.optimizer,
data_loader=train_loader,
target_epsilon=cfg.epsilon,
target_delta=cfg.delta,
epochs=cfg.epochs,
max_grad_norm=cfg.clipping_threshold,
)
def train_step(self, train_loader):
train_loss = 0

self.model.train()
self.optimizer.zero_grad()

if self.cfg.dp:
with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader:
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)):
# Move batch tensors to the same device as the model
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}

# Forward pass
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"])
loss = self.criterion(outputs.logits, batch["label"])

loss.backward()
train_loss += loss.mean().item()
self.optimizer.step()
self.optimizer.zero_grad()
if self.cfg.scheduler:
self.scheduler.step()
else:
for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)):
# Move batch tensors to the same device as the model
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}

# Forward pass
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"])
loss = self.criterion(outputs.logits, batch["label"])

loss.backward()
train_loss += loss.mean().item()
self.optimizer.step()
self.optimizer.zero_grad()
if self.cfg.scheduler:
self.scheduler.step()

return train_loss/len(train_loader)

def evaluate_step(self, val_loader):
# Evaluation loop
val_loss = 0
self.model.eval()
predictions = []
true_labels = []
with torch.no_grad():
for batch in tqdm(val_loader):
# Move batch tensors to the same device as the model
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}

# Forward pass and compute validation loss
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"])
loss = self.val_criterion(outputs.logits, batch["label"])
_, preds = torch.max(outputs.logits, dim=1)
predictions.extend(preds.tolist())
true_labels.extend(batch["label"].tolist())
val_loss += loss.item()
accuracy = accuracy_score(true_labels, predictions)
return accuracy , val_loss/len(val_loader)

def train_and_evaluate(self, epochs, train_loader, val_loader_one, val_loader_two):
best_accuracy = 0
best_accuracy_two = 0

wandb_log = []

for epoch in range(epochs):
log_data = {}
train_loss = self.train_step(train_loader)
log_data["train_loss"] = train_loss
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}")
accuracy, val_loss = self.evaluate_step(val_loader=val_loader_one)
log_data["validation_loss"] = val_loss
log_data["accuracy"] = accuracy
if accuracy > best_accuracy:
best_accuracy = accuracy
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}")
logging.info(f"Accuracy on validation set: {accuracy * 100} %")
if val_loader_two:
accuracy_two , val_loss_two = self.evaluate_step(val_loader=val_loader_two)
log_data["validation_two_loss"] = val_loss_two
log_data["accuracy_two"] = accuracy_two
if accuracy_two > best_accuracy_two:
best_accuracy_two = accuracy_two
logging.info(f"Epoch {epoch+1} Validation two loss: {val_loss_two}")
logging.info(f"Accuracy on validation two set: {accuracy_two * 100} %")
wandb_log.append(log_data)
logging.info("Best results:")
if self.cfg.dp:
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta))
logging.info(f"Best validatin accuracy: {best_accuracy}")
if val_loader_two:
logging.info(f"Second validation set accuracy: {best_accuracy_two}")
if self.cfg.use_wandb:
for i, epoch_data in enumerate(wandb_log):
wandb.log(epoch_data)
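
For readers unfamiliar with Opacus, the standalone sketch below (with made-up values, not this repository's settings) illustrates the RDP accounting that PrivacyEngine performs behind make_private_with_epsilon: each noisy optimizer step is registered with the accountant, and the spent epsilon can then be queried for a given delta.

# Standalone illustration with placeholder numbers; not repository code.
from opacus.accountants import RDPAccountant

accountant = RDPAccountant()
sample_rate = 32 / 50_000  # expected batch size / training-set size
for _ in range(1_000):     # 1,000 noisy optimizer steps
    accountant.step(noise_multiplier=1.0, sample_rate=sample_rate)
print(accountant.get_epsilon(delta=1e-5))  # privacy budget spent so far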

+ 3
- 0
Generation/README.md

@@ -0,0 +1,3 @@
# Text Generation Task on the E2E Dataset

Details coming soon!

+ 68
- 0
Generation/config.py

@@ -0,0 +1,68 @@
import argparse
import torch
from media import media_path, model_cache_path

class Config:
def __init__(self):
self.parser = argparse.ArgumentParser()
self.add_arguments()
self.args = self.parse()
self.post_process()
def parse(self):
return self.parser.parse_args()
def add_arguments(self):
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training')
# self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available')
self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')

self.parser.add_argument('--batch_size', type=int, default=8, help='batch size for training')
self.parser.add_argument('--virtual_batch_size', type=int, default=8, help='batch size for updating model parameters')
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training')
self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer')
self.parser.add_argument('--optimizer_eps', type=float, default=1e-6, help='optimizer eps')
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1")
self.parser.add_argument("--scheduler_type", type=str, default="linear", choices=['linear', 'steplr'], help="Scheduler types")
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps')
self.parser.add_argument('--scheduler_warmup_steps', type=int, default=None, help='Warmup steps can be given directly')
self.parser.add_argument('--scheduler_step_size', type=int, default=1, help='Scheduler step size for stepLR scheduler')
self.parser.add_argument('--scheduler_gamma', type=float, default=0.5, help='Scheduler decrease rate for stepLR scheduler')

self.parser.add_argument('--model_name', type=str, default='gpt2', choices=['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'], help='GPT-2 model variant to fine-tune')
self.parser.add_argument('--seq_length', type=int, default=128, help='Max length for tokenization')
self.parser.add_argument('--peft_mode', type=str, default='bitfit', choices=['lora', 'bitfit', 'full', 'lorabitfit', 'adapter', 'adapterbitfit'], help='PEFT mode for fine-tuning')
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora')
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora')
self.parser.add_argument('--drop_out', type=float, default=0.0, help='Dropout for lora')
self.parser.add_argument('--reduction_factor', type=int, default=16, help='Reduction_factor for adapter')
self.parser.add_argument('--dataset', type=str, default='e2e_nlg', choices=['e2e_nlg', 'dart'], help='Dataset name')
self.parser.add_argument('--toy_example', type=int, default=0, help='if 1, the first 1024 data from train dataset will be used for training')

self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1")
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget")
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget")
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning')
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm")

self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1")
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name")
self.parser.add_argument("--run_name", type=str, default=None, help="run name")

self.parser.add_argument("--beam_size", type=int, default=5, help="Number of beans for generation")

self.parser.add_argument('--f', type=str, default=None, help='Path to Jupyter kernel JSON file')

self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes adapter or lora then bitfit")
self.parser.add_argument('--lr_two', type=float, default=2e-3, help='Learning rate for second step of training')
self.parser.add_argument('--virtual_batch_size_two', type=int, default=8, help='batch size for updating model parameters for second step of training')
self.parser.add_argument('--epochs_two', type=int, default=5, help='Number of epochs for second step training')
self.parser.add_argument('--weight_decay_two', type=float, default=0.1, help='Weight decay for second optimizer')

def post_process(self):
assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu")
self.args.media_path = media_path
self.args.model_cache_path = model_cache_path

+ 309
- 0
Generation/data.py

@@ -0,0 +1,309 @@
from datasets import load_from_disk
import copy
import sys
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Tuple, Union

def load_dataset(dataset_name, path, toy_example):
dataset = load_from_disk(f"{path}saved_datasets/{dataset_name}")
# toy example for develop
if toy_example == 1:
dataset["train"] = dataset["train"].select(range(1024))
dataset["validation"] = dataset["validation"].select(range(512))
return dataset


def load_dataloaders(dataset, dataset_name, batch_size, virtual_batch_size, tokenizer, seq_length, dp=1):
data_collator = DataCollatorForData2TextLanguageModeling(tokenizer)
if dataset_name == 'e2e_nlg':
train_dataset = E2ETextDataset(tokenizer,
dataset["train"]["meaning_representation"],
dataset["train"]["human_reference"],
seq_length,
tokenizer.bos_token,
tokenizer.eos_token,
seq_length)
validation_dataset = E2ETextDataset(tokenizer,
dataset["validation"]["meaning_representation"],
dataset["validation"]["human_reference"],
seq_length,
tokenizer.bos_token,
tokenizer.eos_token,
seq_length)

train_data_size = len(dataset["train"])
if dp == 1:
sampler = WeightedRandomSampler([virtual_batch_size/train_data_size for _ in range(train_data_size)], num_samples=train_data_size, replacement=True)
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, sampler=sampler, drop_last=True, collate_fn=data_collator)
else:
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, collate_fn=data_collator)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, collate_fn=data_collator)
elif dataset_name == 'dart':
pass

return train_loader, validation_loader


# Copyright (c) Xuechen Li. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

class E2ETextDataset(Dataset):

def __init__(
self,
tokenizer: PreTrainedTokenizer,
src_lines,
tgt_lines,
block_size: int,
bos_tok: str,
eos_tok: str,
max_seq_len=sys.maxsize,
max_examples=sys.maxsize,
**_,
):

edited_sents = []
for src, tgt in zip(src_lines, tgt_lines):
sent = ' {} {} '.format(src, bos_tok) + tgt + ' {}'.format(eos_tok)
edited_sents.append(sent)

# --- Filter out super long sentences ---
new_src_lines, new_tgt_lines, new_edited_sents = [], [], []
for src_line, tgt_line, edited_sent in zip(src_lines, tgt_lines, edited_sents):
tokenized_edited_sent = tokenizer.tokenize(edited_sent)
if len(tokenized_edited_sent) <= max_seq_len:
new_src_lines.append(src_line)
new_tgt_lines.append(tgt_line)
new_edited_sents.append(edited_sent)
del src_line, tgt_line, edited_sent
src_lines, tgt_lines, edited_sents = new_src_lines, new_tgt_lines, new_edited_sents
# ---------------------------------------

# --- Truncate the dataset if necessary; this must be after the length filtering. ---
src_lines = src_lines[:max_examples]
tgt_lines = tgt_lines[:max_examples]
edited_sents = edited_sents[:max_examples]
# ---

batch_encoding = tokenizer(
edited_sents,
add_special_tokens=True,
truncation=True,
max_length=block_size,
is_split_into_words=False,
)

self.examples = batch_encoding["input_ids"]
self.labels = copy.deepcopy(self.examples)

# split into category words:
ssl_lst = []
for ss in src_lines:
ssl = [la.split(':')[0].strip() for la in ss.split('|')]
ssl_lst.append(ssl)

self.src_cat = tokenizer(
ssl_lst,
add_special_tokens=True,
truncation=True,
max_length=block_size,
is_split_into_words=True
)['input_ids']

self.src_sent = []
self.tgt_sent = []

# temp_src_len = 0
# temp_tgt_len = 0
# temp_count = 0

separator = tokenizer(bos_tok, add_special_tokens=False)['input_ids'][0]
for i, elem in enumerate(self.labels):
sep_idx = elem.index(separator) + 1
self.src_sent.append(self.examples[i][:sep_idx - 1])
self.tgt_sent.append(self.examples[i][sep_idx - 1:])
self.labels[i][:sep_idx] = [-100] * sep_idx # Doesn't contribute to loss.
# temp_src_len += sep_idx - 1
# temp_tgt_len += len(elem) - (sep_idx - 1)
# temp_count += 1

# print('tgt_avg: ', temp_tgt_len / temp_count)
# print('src_avg: ', temp_src_len / temp_count)
# print('ratios: ', temp_src_len / temp_tgt_len)

# print(self.labels[0])
# print(self.examples[0])
# print(edited_sents[0])
# print(self.src_sent[0])
# print(self.tgt_sent[0])
# print(self.src_cat[0])
assert len(self.src_cat) == len(self.examples)

def __len__(self):
return len(self.examples)

def __getitem__(self, i):
return (
torch.tensor(self.examples[i], dtype=torch.long),
torch.tensor(self.labels[i], dtype=torch.long),
torch.tensor(self.src_sent[i], dtype=torch.long),
torch.tensor(self.tgt_sent[i], dtype=torch.long),
torch.tensor(self.src_cat[i], dtype=torch.long),
)


# InputDataClass = NewType("InputDataClass", Any)

"""
A DataCollator is a function that takes a list of samples from a Dataset
and collates them into a batch, as a dictionary of Tensors.
"""
# DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])


@dataclass
class DataCollatorForData2TextLanguageModeling:
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
tokenizer: PreTrainedTokenizer
mlm: bool = False
format_mode: str = 'cat'
mlm_probability: float = 0.15

def __call__(
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
) -> Dict[str, torch.Tensor]:
if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
input_ids, labels, src, tgt, cate = zip(*examples)
if self.mlm:
batch = self._tensorize_batch(input_ids)
inputs, labels = self.mask_tokens(batch)
return {"input_ids": inputs, "labels": labels}
else:
if self.format_mode == 'cat':
mode_input = 3
elif self.format_mode == 'peek':
mode_input = 1
elif self.format_mode == 'nopeek':
mode_input = 2
elif self.format_mode == 'infix':
mode_input = 4

# mode_input = 1 # means that we take the input again.
# mode_input = 2 # means that we do not peek at src again.
# mode_input = 3 # means that we look at the categories, and see the input again.

if mode_input == 1:
# input, batch
batch = self._tensorize_batch(input_ids)
labels = self._tensorize_batch(labels)
src = self._tensorize_batch(src)
cate_batch, cate_attn = None, None
# tgt = self._tensorize_batch(tgt)
elif mode_input == 2:
# nopeek.
batch = self._tensorize_batch(tgt)
labels = batch.clone()
src = self._tensorize_batch(src)
cate_batch, cate_attn = None, None
elif mode_input == 3:
batch = self._tensorize_batch(input_ids)
labels = self._tensorize_batch(labels)
src = self._tensorize_batch(cate)
cate_batch, cate_attn = None, None
elif mode_input == 4:
batch = self._tensorize_batch(tgt)
labels = batch.clone()
src = self._tensorize_batch(src)

cate_batch = self._tensorize_batch(cate)
cate_attn = (cate_batch != self.tokenizer.pad_token_id)

labels[labels == self.tokenizer.pad_token_id] = -100 # tgt
src_attn = (src != self.tokenizer.pad_token_id) # src
tgt_attn = (batch != self.tokenizer.pad_token_id) # tgt

if cate_batch is None:
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn':tgt_attn,
'src':src}
else:
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn': tgt_attn,
'src': src, "cate_batch":cate_batch, "cate_attn":cate_attn}

def _tensorize_batch(
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
) -> torch.Tensor:
# In order to accept both lists of lists and lists of Tensors
if isinstance(examples[0], (list, tuple)):
examples = [torch.tensor(e, dtype=torch.long) for e in examples]
length_of_first = examples[0].size(0)
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length:
return torch.stack(examples, dim=0)
else:
if self.tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({self.tokenizer.__class__.__name__}) does not have one."
)
return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""

if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
)

labels = inputs.clone()
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
probability_matrix = torch.full(labels.shape, self.mlm_probability)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
if self.tokenizer._pad_token is not None:
padding_mask = labels.eq(self.tokenizer.pad_token_id)
probability_matrix.masked_fill_(padding_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens

# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

# 10% of the time, we replace masked input tokens with random word
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]

# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
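
An illustrative walk-through (not part of the file) of the example format that E2ETextDataset builds: the meaning representation and the human reference are joined around the BOS token, and every label position up to and including the separator is set to -100 so the loss only covers the reference. The strings below are hypothetical.

# Hypothetical E2E example; GPT-2 uses <|endoftext|> as both its bos and eos token.
bos_tok = eos_tok = "<|endoftext|>"
src = "name : The Vaults | food : French"
tgt = "The Vaults serves French food."
sent = ' {} {} '.format(src, bos_tok) + tgt + ' {}'.format(eos_tok)
print(sent)
# After tokenization, labels[:sep_idx] are replaced with -100, where sep_idx is the
# position right after the first bos token, so only the target side is scored.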

+ 5323
- 0
Generation/e2e_ref.txt
File diff suppressed because it is too large


+ 125
- 0
Generation/main.py

@@ -0,0 +1,125 @@
from config import Config
import os
import random
import numpy as np
import torch
import wandb
import logging
import transformers
import warnings
import subprocess
from model import load_model, prepare_model, get_number_of_trainable_parameters, load_model_weights, get_number_of_parameters
from data import load_dataset, load_dataloaders
from train import Trainer, generate_evaluation_output, save_evaluation_output
from utils import clean_hyperparameters, copy_model_weights

warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes")

transformers.logging.set_verbosity_error()

def set_seeds(seed: int):
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
transformers.set_seed(seed)


def run_metric_script(file_path):
result = subprocess.run(["e2e/measure_scores.py", "-p", "e2e_ref.txt", file_path], stdout=subprocess.PIPE)
output = result.stdout.decode('utf-8')
lines = output.split('\n')
return lines[-7:-2]


def main(cfg):
set_seeds(cfg.seed)

model, tokenizer = load_model(cfg.model_name, cache_dir=cfg.model_cache_path)
model = prepare_model(model, cfg)
num_of_all_params = get_number_of_parameters(model)
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %")

dataset = load_dataset(cfg.dataset, cfg.media_path, cfg.toy_example)
cfg.train_data_size = len(dataset["train"])
# dataset = tokenize_dataset(tokenizer, dataset, cfg.dataset, cfg.seq_length)
train_loader, validation_loader = load_dataloaders(dataset, cfg.dataset, cfg.batch_size, cfg.virtual_batch_size, tokenizer, cfg.seq_length, cfg.dp)
logging.info("Dataset loaded and tokenized")

trainer = Trainer(cfg, model, train_loader)
trainer.train_and_evaluate(cfg.epochs, train_loader, validation_loader)

if cfg.two_step_training and cfg.dp:
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model)
model_two, _ = load_model(cfg.model_name, cache_dir=cfg.model_cache_path)
model_two = prepare_model(model_two, cfg)
copy_model_weights(model, model_two)
del model
model = model_two
for a, b in model.named_parameters():
if 'bias' in a and not 'adapter' in a:
b.requires_grad = True
else:
b.requires_grad = False
logging.info("New Model adjusted")
num_of_all_params = get_number_of_parameters(model)
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %")
trainer_two = Trainer(cfg, model, train_loader, second_trainer=True)
trainer_two.train_and_evaluate(cfg.epochs_two, train_loader, validation_loader)

# evaluate model on test data
model.eval()
model = load_model_weights(model, cfg.peft_mode, f"{trainer.save_path}/{trainer.model_name}.pth")
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size)
output_path = f"{cfg.media_path}generation_eval_outputs/{cfg.dataset}/{cfg.peft_mode}"
if not os.path.exists(output_path):
os.makedirs(output_path)
output_name = cfg.run_name if cfg.run_name else "generation_output"
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v1.txt")
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size, do_sample=True)
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v2.txt")
logging.info("Generation for test data saved")
metrics = run_metric_script(f"{output_path}/{output_name}-v1.txt")
logging.info("Metrics without sampling:")
for metric in metrics:
logging.info(metric)
metrics = run_metric_script(f"{output_path}/{output_name}-v2.txt")
logging.info("Metrics with sampling:")
for metric in metrics:
logging.info(metric)

if cfg.use_wandb:
wandb.finish()

if __name__ == "__main__":
cfg = Config().args

log_path = f"logs/{cfg.dataset}/{cfg.peft_mode}/"
if not os.path.exists(log_path):
os.makedirs(log_path)
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log"

if cfg.use_wandb:
wandb.login(key="YOUR_KEY")
if cfg.run_name:
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name)
else:
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}")
log_file_name = wandb.run.name

logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.info("Start of the logging")
hyperparameters = {key: value for key, value in vars(cfg).items()}
hyperparameters = clean_hyperparameters(hyperparameters)
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()])
logging.info("config:\n" + hyperparameters_str)

main(cfg)

+ 2
- 0
Generation/media.py

@@ -0,0 +1,2 @@
media_path = "YOUR_MEDIA_PATH"
model_cache_path = "YOUR_CACHE_PATH"

+ 299
- 0
Generation/model.py

@@ -0,0 +1,299 @@
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
import torch
import transformers
from torch import nn
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP
import os


# Loads model and its tokenizer
def load_model(model_name, cache_dir="."):
tokenizer = GPT2Tokenizer.from_pretrained(f"{cache_dir}gpt2/{model_name}-tokenizer")
model = GPT2LMHeadModel.from_pretrained(f"{cache_dir}gpt2/{model_name}-model")
add_pad_token(model, tokenizer)
model.requires_grad_(False)
return model, tokenizer

# Adds padding token to the tokenizer and model embedding layer
def add_pad_token(model, tokenizer):
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
a = model.get_input_embeddings().weight
a.data[-1] = a.data[:-1].mean(dim=0)

# Returns number of trainable parameters of the model
def get_number_of_trainable_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Returns number of parameters of the model
def get_number_of_parameters(model):
return sum(p.numel() for p in model.parameters())

# Mutates model structure and adjusts trainable parameters
def prepare_model(model, cfg):
if cfg.peft_mode == 'bitfit':
for a, b in model.named_parameters():
if 'bias' in a:
b.requires_grad = True
elif cfg.peft_mode == 'lora':
model.requires_grad_(True)
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out)
mark_only_lora_as_trainable(model)
elif cfg.peft_mode == 'lorabitfit':
model.requires_grad_(True)
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out)
mark_only_lora_as_trainable(model)
if cfg.two_step_training == 0:
for a, b in model.named_parameters():
if 'bias' in a:
b.requires_grad = True
elif cfg.peft_mode == 'full':
model.requires_grad_(True)
elif cfg.peft_mode == 'adapter':
model.requires_grad_(False)
bottleneck_size = model.config.n_embd // cfg.reduction_factor
mutate_model_adapter(model, bottleneck_size, model.config.n_embd)
for a, b in model.named_parameters():
if 'adapter' in a:
b.requires_grad = True
elif cfg.peft_mode == 'adapterbitfit':
model.requires_grad_(False)
bottleneck_size = model.config.n_embd // cfg.reduction_factor
mutate_model_adapter(model, bottleneck_size, model.config.n_embd)
if cfg.two_step_training == 0:
for a, b in model.named_parameters():
if 'adapter' in a or 'bias' in a:
b.requires_grad = True
else:
for a, b in model.named_parameters():
if 'adapter' in a:
b.requires_grad = True
model.to(cfg.device)
return model

def save_model(model, peft_mode, save_path, model_name):
if not os.path.exists(save_path):
os.makedirs(save_path)
if peft_mode == "bitfit":
bias_params = {}
for name, param in model.named_parameters():
if 'bias' in name:
bias_params[name] = param.data.clone()
torch.save(bias_params, f'{save_path}/{model_name}.pth')
elif peft_mode == 'lora':
lora_params = {}
for name, param in model.named_parameters():
if 'lora' in name:
lora_params[name] = param.data.clone()
torch.save(lora_params, f'{save_path}/{model_name}.pth')
elif peft_mode == 'lorabitfit':
lorabitfit_params = {}
for name, param in model.named_parameters():
if 'lora' in name or 'bias' in name:
lorabitfit_params[name] = param.data.clone()
torch.save(lorabitfit_params, f'{save_path}/{model_name}.pth')
elif peft_mode == 'full':
pass
elif peft_mode == 'adapter':
adapter_params = {}
for name, param in model.named_parameters():
if 'adapter' in name:
adapter_params[name] = param.data.clone()
torch.save(adapter_params, f'{save_path}/{model_name}.pth')
elif peft_mode == 'adapterbitfit':
adapterbitfit_params = {}
for name, param in model.named_parameters():
if 'adapter' in name or 'bias' in name:
adapterbitfit_params[name] = param.data.clone()
torch.save(adapterbitfit_params, f'{save_path}/{model_name}.pth')

def load_model_weights(model, peft_mode, path):
if peft_mode == 'full':
pass
else:
model_weights = torch.load(path)
with torch.no_grad():
for name, param in model.named_parameters():
if name in model_weights:
param.copy_(model_weights[name])

return model

# Copyright (c) Xuechen Li. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
LoRA layers.

This version does not have merged weights for zero latency inference. It makes the code easier to read and maintain.
Adapted from
https://github.com/microsoft/LoRA
https://www.microsoft.com/en-us/research/project/dp-transformers/
"""

class MYDPMergedLinear(nn.Module):
def __init__(
self,
in_features: int,
out_features: int,
pretrained_module,
lora_r=0,
lora_alpha=1.,
lora_dropout=0.,
):
super(MYDPMergedLinear, self).__init__()
self.pretrained_module = pretrained_module
self.lora_r = lora_r
self.lora_alpha = lora_alpha
self.lora_dropout = nn.Dropout(p=lora_dropout)
if self.lora_r > 0:
self.lora_A = nn.Linear(in_features=in_features, out_features=lora_r, bias=False)
self.lora_B = nn.Linear(in_features=lora_r, out_features=out_features, bias=False)
self.scaling = self.lora_alpha / lora_r
self.reset_parameters()

def forward(self, x: torch.Tensor):
result = self.pretrained_module(x)
if self.lora_r > 0:
after_dropout = self.lora_dropout(x)
after_A = self.lora_A(after_dropout)
after_B = self.lora_B(after_A)
result += after_B * self.scaling
return result

def reset_parameters(self):
# self.linear.reset_parameters()
if self.lora_r > 0:
self.lora_A.reset_parameters()
self.lora_B.weight.data.zero_()

@staticmethod
def from_transformers_conv1d(
original_layer,
lora_r=0,
lora_alpha=1.,
lora_dropout=0.,
) -> "MYDPMergedLinear":
lora_layer = MYDPMergedLinear(
in_features=original_layer.weight.shape[0],
out_features=original_layer.weight.shape[1],
pretrained_module = original_layer,
lora_r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
).to(original_layer.weight.device)
return lora_layer

def convert_gpt2_attention_to_lora(
model: transformers.GPT2PreTrainedModel,
lora_r=0,
lora_alpha=1.,
lora_dropout=0.,
) -> transformers.GPT2PreTrainedModel:
if not isinstance(model, transformers.GPT2PreTrainedModel):
raise TypeError("Requires a GPT2 model")

if not hasattr(model, "h") and hasattr(model, "transformer"):
transformer = model.transformer
else:
transformer = model

for h_i in transformer.h:
new_layer = MYDPMergedLinear.from_transformers_conv1d(
original_layer=h_i.attn.c_attn,
lora_r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
)
h_i.attn.c_attn = new_layer

return model

def mutate_model(model: torch.nn.Module, lora_r=0, lora_alpha=1., lora_dropout=0.):
for name, module in model.named_children():
if name == "c_attn":
new_layer = MYDPMergedLinear.from_transformers_conv1d(
original_layer=module,
lora_r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
)
setattr(model, name, new_layer)
else:
mutate_model(module, lora_r, lora_alpha, lora_dropout) # recursively call the function on the module


def mark_only_lora_as_trainable(model: torch.nn.Module) -> None:
model.requires_grad_(True)
for n, p in model.named_parameters():
if 'lora_' not in n:
p.requires_grad = False


class AdapterLayer(nn.Module):
def __init__(
self,
emb_dim: int,
bottleneck_size: int,
bias = True
):
super().__init__()

self.sharif_llm_adapter = nn.Sequential(
nn.Linear(emb_dim, bottleneck_size, bias=bias),
nn.ReLU(),
nn.Linear(bottleneck_size, emb_dim, bias=bias)
)

def forward(self, x: torch.Tensor):
output = x + self.sharif_llm_adapter(x)
return output

class FeedForwardAdapterWrapper(nn.Module):
def __init__(
self,
original_module: GPT2MLP,
bottleneck_size: int,
emb_dim,
bias = True
):

super().__init__()

assert isinstance(original_module, GPT2MLP)

self.original_module = original_module
self.adapter = AdapterLayer(emb_dim, bottleneck_size, bias=bias)

def forward(self, x: torch.Tensor):
output = self.original_module(x)
output = self.adapter(output)
return output
def mutate_model_recursive_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True):
for name, module in model.named_children():
if isinstance(module, GPT2MLP):
feed_forward_with_adapter = FeedForwardAdapterWrapper(module, bottleneck_size, emb_dim, bias)
setattr(model, name, feed_forward_with_adapter)
else:
mutate_model_recursive_adapter(module, bottleneck_size, emb_dim, bias) # recursively call the function on the module

def mutate_model_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True):
if hasattr(model, '_mutated'):
print("Model already contains adapter layers! \n Try reloading the model.")
return

mutate_model_recursive_adapter(model, bottleneck_size, emb_dim, bias)
model._mutated = True

+ 211
- 0
Generation/train.py

@@ -0,0 +1,211 @@
from transformers import get_linear_schedule_with_warmup
import logging
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm import tqdm
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager


import wandb
import math
from model import save_model
from torch.optim.lr_scheduler import StepLR


class Trainer:
def __init__(self, cfg, model, train_loader, checkpoint=None, second_trainer=False):
if second_trainer:
self.epochs = cfg.epochs_two
self.lr = cfg.lr_two
self.weight_decay = cfg.weight_decay_two
else:
self.epochs = cfg.epochs
self.lr = cfg.lr
self.weight_decay = cfg.weight_decay
self.optimizer = AdamW(model.parameters(), lr=self.lr, weight_decay=self.weight_decay, eps=cfg.optimizer_eps)
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size
total_steps = len(train_loader) * self.gradient_accumulation_steps * self.epochs
if cfg.scheduler:
if cfg.scheduler_type == "linear":
warmup_steps = cfg.scheduler_warmup_steps if cfg.scheduler_warmup_steps else cfg.scheduler_warmup_ratio*total_steps
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
elif cfg.scheduler_type == "steplr":
self.scheduler = StepLR(self.optimizer, step_size=cfg.scheduler_step_size, gamma=cfg.scheduler_gamma)
self.dp = cfg.dp
self.model = model
self.cfg = cfg
self.save_path = f"{cfg.media_path}generation_saved_models/{cfg.dataset}/{cfg.peft_mode}"
self.model_name = self.cfg.run_name if self.cfg.run_name else "best_model"

if cfg.dp:
self.model.train()
self.privacy_engine = PrivacyEngine(
accountant="rdp",
)
if checkpoint:
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model)
self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon(
module=self.model,
optimizer=self.optimizer,
data_loader=train_loader,
target_epsilon=cfg.epsilon,
target_delta=cfg.delta,
epochs=self.epochs,
max_grad_norm=cfg.clipping_threshold,
)

def train_step(self, train_loader):
train_loss = 0

self.model.train()
self.optimizer.zero_grad()

if self.dp:
with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader:
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)):
# Move batch tensors to the same device as the model
batch = prepare_inputs(batch)
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}
# Forward pass
outputs = self.model(**batch)
loss = outputs.loss
loss.backward()
train_loss += loss.item()
self.optimizer.step()
self.optimizer.zero_grad()
if self.cfg.scheduler and self.cfg.scheduler_type == "linear":
self.scheduler.step()

if self.cfg.scheduler and self.cfg.scheduler_type == "steplr":
self.scheduler.step()
else:
for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)):
# Move batch tensors to the same device as the model
batch = prepare_inputs(batch)
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}
# Forward pass
outputs = self.model(**batch)
loss = outputs.loss
loss.backward()
train_loss += loss.item()
self.optimizer.step()
self.optimizer.zero_grad()
if self.cfg.scheduler and self.cfg.scheduler_type == "linear":
self.scheduler.step()

if self.cfg.scheduler and self.cfg.scheduler_type == "steplr":
self.scheduler.step()

return train_loss/len(train_loader)

def evaluate_step(self, val_loader):
# Evaluation loop
val_loss = 0
self.model.eval()
with torch.no_grad():
for batch in tqdm(val_loader):
# Move batch tensors to the same device as the model
batch = prepare_inputs(batch)
batch = {k: v.to(self.cfg.device) for k, v in batch.items()}

outputs = self.model(**batch)
loss = compute_loss_per_input(outputs, batch)
val_loss += loss.mean().item()
return val_loss/len(val_loader)

def train_and_evaluate(self, epochs, train_loader, val_loader):
best_validation_loss = None
best_epoch = 0

wandb_log = []

for epoch in range(epochs):
log_data = {}
train_loss = self.train_step(train_loader)
log_data["train_loss"] = train_loss
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}")
val_loss = self.evaluate_step(val_loader=val_loader)
log_data["validation_loss"] = val_loss
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}")
if best_validation_loss is None or val_loss < best_validation_loss:
best_validation_loss = val_loss
best_epoch = epoch
save_model(self.model, self.cfg.peft_mode, self.save_path, self.model_name)
logging.info(f"Model improved and saved for epoch {epoch+1}")
wandb_log.append(log_data)
logging.info("Best results:")
if self.cfg.dp:
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta))
logging.info(f"Best validatin loss: {best_validation_loss} for Epoch: {best_epoch+1}")

if self.cfg.use_wandb:
for i, epoch_data in enumerate(wandb_log):
wandb.log(epoch_data)


def prepare_inputs(batch):
batch.pop('src_attn', None)
batch.pop('tgt_attn', None)
batch.pop('src', None)
return batch

def compute_loss_per_input(outputs, batch):
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = batch["labels"][..., 1:].contiguous()
seq_lens = (shift_labels != -100).sum(dim=1)
loss = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none")
loss = loss.sum(dim=1) / seq_lens
return loss

def save_evaluation_output(outputs, path):
with open(path, "w") as file:
for strings in outputs:
for string in strings:
file.write(string + "\n")
# file.write("\n")

def generate_evaluation_output(model, tokenizer, data, device, max_length, beam_size=5, do_sample=False, num_return_sequences=1):
generated_texts = []

prev = None

for entry in tqdm(data):
if prev != entry["meaning_representation"]:
prev = entry["meaning_representation"]
prompt = f"{entry['meaning_representation']} {tokenizer.eos_token}"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
with torch.no_grad():
outputs = model.generate(**inputs,
num_beams=beam_size,
max_length=max_length,
do_sample=do_sample,
early_stopping=True,
min_length=5,
num_return_sequences=num_return_sequences,
bad_words_ids = [[628], [198], [tokenizer.pad_token_id]],
pad_token_id=tokenizer.eos_token_id,
repetition_penalty=1,
top_k=0,
top_p=0.9)
temp_generated_texts = []
for output in outputs:
generated_text = tokenizer.decode(output[len(inputs["input_ids"][0]):], skip_special_tokens=True)
temp_generated_texts.append(generated_text.strip())
generated_texts.append(temp_generated_texts)
return generated_texts
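
A minimal check of the masking behaviour in compute_loss_per_input, included only as an illustration: F.cross_entropy ignores positions labelled -100 by default, so each example's loss is averaged over its target tokens alone. The tensor values are placeholders.

# Illustrative only; mirrors the shift-and-mask logic of compute_loss_per_input.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 10)               # (batch, seq_len, vocab)
labels = torch.tensor([[-100, -100, 3, 7]])  # source span masked with -100
shift_logits = logits[..., :-1, :]
shift_labels = labels[..., 1:]
per_token = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none")
print(per_token)                             # zero at the masked position, real loss elsewhere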

+ 42
- 0
Generation/utils.py

@@ -0,0 +1,42 @@
import torch

def clean_hyperparameters(hyperparameters: dict):
if hyperparameters["scheduler"] == 0:
hyperparameters.pop("scheduler_type", None)
hyperparameters.pop("scheduler_warmup_ratio", None)
hyperparameters.pop("scheduler_warmup_steps", None)
hyperparameters.pop("scheduler_step_size", None)
hyperparameters.pop("scheduler_gamma", None)
if hyperparameters["peft_mode"] != "lora":
hyperparameters.pop("rank", None)
hyperparameters.pop("alpha", None)
hyperparameters.pop("drop_out", None)
if hyperparameters["peft_mode"] != "adapter" and hyperparameters["peft_mode"] != "adapterbitfit":
hyperparameters.pop("reduction_factor", None)
if hyperparameters["dp"] == 0:
hyperparameters.pop("epsilon", None)
hyperparameters.pop("delta", None)
hyperparameters.pop("clipping_mode", None)
hyperparameters.pop("clipping_threshold", None)
if hyperparameters["use_wandb"] == 0:
hyperparameters.pop("wandb_project_name", None)
hyperparameters.pop("use_wandb", None)
if hyperparameters["two_step_training"] == 0:
hyperparameters.pop("lr_two", None)
hyperparameters.pop("virtual_batch_size_two", None)
hyperparameters.pop("epochs_two", None)
hyperparameters.pop("weight_decay_two", None)
hyperparameters.pop("f", None)
hyperparameters.pop("media_path", None)
hyperparameters.pop("model_cache_path", None)
return hyperparameters


def copy_model_weights(model1, model2):
model1.eval()
model2.eval()
params1 = model1.parameters()
params2 = model2.parameters()
with torch.no_grad():
for param1, param2 in zip(params1, params2):
param2.data.copy_(param1.data)

+ 3
- 0
README.md

@@ -0,0 +1,3 @@
# Privacy-Preserving Fine-tuning of Parameter-Efficient Language Models

Details coming soon!
