@@ -0,0 +1,3 @@ | |||
# Sentence Classification Task on the GLUE Benchmark
Details coming soon!
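In the meantime, here is a minimal, hedged sketch of how the training entry point is typically launched. The flag names come from `config.py`; the entry-point file name (`main.py`) and the example values are assumptions for illustration, and the dataset/model paths in `media.py` must be filled in first.

```python
# Hypothetical launcher: fine-tune RoBERTa-large on SST-2 with LoRA under differential privacy.
# All flags below are defined in config.py; adjust them to your setup.
import subprocess

subprocess.run([
    "python", "main.py",
    "--dataset", "sst2",             # one of: sst2, mnli, qqp, qnli
    "--peft_mode", "lora",           # one of: lora, bitfit, full, lorabitfit
    "--rank", "8", "--alpha", "16",  # LoRA hyperparameters
    "--dp", "1",                     # enable differentially private fine-tuning (Opacus)
    "--epsilon", "3",                # target privacy budget
    "--batch_size", "16",            # physical batch size
    "--virtual_batch_size", "64",    # logical batch size used for each optimizer update
], check=True)
```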
@@ -0,0 +1,52 @@ | |||
import argparse | |||
import torch | |||
from media import media_path | |||
class Config: | |||
def __init__(self): | |||
self.parser = argparse.ArgumentParser() | |||
self.add_arguments() | |||
self.args = self.parse() | |||
self.post_process() | |||
def parse(self): | |||
return self.parser.parse_args() | |||
def add_arguments(self): | |||
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||
self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||
        self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')
        self.parser.add_argument('--batch_size', type=int, default=16, help='Physical batch size for training')
        self.parser.add_argument('--virtual_batch_size', type=int, default=16, help='Logical batch size used for each optimizer update (gradient accumulation)')
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||
        self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||
self.parser.add_argument('--optimizer_eps', type=float, default=1e-8, help='optimizer eps') | |||
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||
self.parser.add_argument('--max_length', type=int, default=128, help='Max length for tokenization') | |||
self.parser.add_argument('--peft_mode', type=str, default='lora', choices=['lora', 'bitfit', 'full', 'lorabitfit'], help='PEFT mode for fine-tuning') | |||
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||
self.parser.add_argument('--dataset', type=str, default='sst2', choices=['sst2', 'mnli', 'qqp', 'qnli'], help='Dataset name') | |||
        self.parser.add_argument('--toy_example', type=int, default=0, help='If 1, only the first 1024 training examples are used')
self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||
self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||
self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||
self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes lora then bitfit") | |||
def post_process(self): | |||
        assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||
self.args.media_path = media_path |
@@ -0,0 +1,107 @@ | |||
from config import Config | |||
from src.model import prepare_model | |||
from src.data import prepare_data | |||
from src.train import Trainer | |||
import os | |||
import random | |||
import numpy as np | |||
import torch | |||
import wandb | |||
import logging | |||
import transformers | |||
import warnings | |||
warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes ") | |||
transformers.logging.set_verbosity_error() | |||
def set_seeds(seed: int): | |||
os.environ['PYTHONHASHSEED'] = str(seed) | |||
random.seed(seed) | |||
np.random.seed(seed) | |||
torch.manual_seed(seed) | |||
torch.cuda.manual_seed(seed) | |||
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||
transformers.set_seed(seed) | |||
def copy_model_weights(model1, model2): | |||
model1.eval() | |||
model2.eval() | |||
params1 = model1.parameters() | |||
params2 = model2.parameters() | |||
with torch.no_grad(): | |||
for param1, param2 in zip(params1, params2): | |||
param2.data.copy_(param1.data) | |||
# Returns the number of trainable parameters of the model
def get_number_of_trainable_parameters(model): | |||
return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||
# Returns number of parameters of the model | |||
def get_number_of_parameters(model): | |||
return sum(p.numel() for p in model.parameters()) | |||
def main(cfg): | |||
set_seeds(cfg.seed) | |||
model, tokenizer = prepare_model(cfg) | |||
num_of_all_params = get_number_of_parameters(model) | |||
    num_of_trainable_params = get_number_of_trainable_parameters(model)
    percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
    logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
train_loader, val_loader_one, val_loader_two = prepare_data(cfg, tokenizer) | |||
logging.info("Data is ready") | |||
trainer = Trainer(cfg, model, train_loader) | |||
trainer.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||
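    # Optional second stage (--two_step_training): copy the trained weights into a fresh model,
    # keep only the bias terms trainable (BiTFiT), and continue training; when DP is enabled,
    # the privacy engine state is carried over via the temp.pth checkpoint.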
if cfg.two_step_training: | |||
if cfg.dp: | |||
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||
model_two, _ = prepare_model(cfg) | |||
copy_model_weights(model, model_two) | |||
del model | |||
model = model_two | |||
for a, b in model.roberta.named_parameters(): | |||
if 'bias' in a: | |||
b.requires_grad = True | |||
else: | |||
b.requires_grad = False | |||
logging.info("New Model adjusted") | |||
num_of_all_params = get_number_of_parameters(model) | |||
        num_of_trainable_params = get_number_of_trainable_parameters(model)
        percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
        logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
trainer_two = Trainer(cfg, model, train_loader, checkpoint="temp.pth") | |||
trainer_two.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||
if cfg.use_wandb: | |||
wandb.finish() | |||
if __name__ == "__main__": | |||
cfg = Config().args | |||
log_path = "logs/" | |||
if not os.path.exists(log_path): | |||
os.makedirs(log_path) | |||
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||
if cfg.use_wandb: | |||
wandb.login(key="YOUR_KEY") | |||
if cfg.run_name: | |||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||
else: | |||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||
log_file_name = wandb.run.name | |||
logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||
logging.info("Start of the logging") | |||
hyperparameters = {key: value for key, value in vars(cfg).items()} | |||
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||
logging.info("config:\n" + hyperparameters_str) | |||
main(cfg) |
@@ -0,0 +1 @@ | |||
media_path = "YOUR_MEDIA_PATH" |
@@ -0,0 +1,45 @@ | |||
from datasets import load_from_disk | |||
from torch.utils.data import DataLoader | |||
from torch.utils.data import WeightedRandomSampler | |||
TASK_TO_KEYS = { | |||
"mnli": ("premise", "hypothesis"), | |||
"qnli": ("question", "sentence"), | |||
"qqp": ("question1", "question2"), | |||
"sst2": ("sentence", None), | |||
} | |||
def prepare_data(cfg, tokenizer): | |||
dataset = load_from_disk(f"{cfg.media_path}saved_datasets/{cfg.dataset}") | |||
sentence1_key, sentence2_key = TASK_TO_KEYS[cfg.dataset] | |||
if cfg.toy_example: | |||
dataset["train"] = dataset["train"].select(range(1024)) | |||
def tokenize(batch): | |||
args = ((batch[sentence1_key],) if sentence2_key is None else (batch[sentence1_key], batch[sentence2_key])) | |||
return tokenizer(*args, padding="max_length", truncation=True, max_length=cfg.max_length) | |||
    dataset = dataset.map(tokenize, batched=True)
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label']) | |||
cfg.train_data_size = len(dataset['train']) | |||
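    # The weights below are uniform, so this is sampling with replacement over the training set
    # (rather than plain shuffling), which is closer to the random-subsampling assumption used by
    # the DP accountant when differential privacy is enabled.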
sampler = WeightedRandomSampler([cfg.virtual_batch_size/cfg.train_data_size for _ in range(cfg.train_data_size)], num_samples=cfg.train_data_size, replacement=True) | |||
train_loader = DataLoader(dataset['train'], batch_size=cfg.virtual_batch_size, sampler=sampler, drop_last=True) | |||
validation_loader_one = None | |||
validation_loader_two = None | |||
if cfg.dataset == "mnli": | |||
if cfg.toy_example: | |||
dataset["validation_matched"] = dataset["validation_matched"].select(range(100)) | |||
dataset["validation_mismatched"] = dataset["validation_mismatched"].select(range(100)) | |||
validation_loader_one = DataLoader(dataset['validation_matched'], batch_size=cfg.batch_size) | |||
validation_loader_two = DataLoader(dataset['validation_mismatched'], batch_size=cfg.batch_size) | |||
else: | |||
if cfg.toy_example: | |||
dataset["validation"] = dataset["validation"].select(range(100)) | |||
validation_loader_one = DataLoader(dataset['validation'], batch_size=cfg.batch_size) | |||
return train_loader, validation_loader_one, validation_loader_two |
@@ -0,0 +1,91 @@ | |||
from transformers import RobertaForSequenceClassification, RobertaTokenizer | |||
import logging | |||
import torch | |||
def prepare_model(cfg): | |||
tokenizer = RobertaTokenizer.from_pretrained(f"{cfg.media_path}models/roberta-large-tokenizer") | |||
model = RobertaForSequenceClassification.from_pretrained(f"{cfg.media_path}models/roberta-large-model") | |||
if cfg.dataset == 'mnli': | |||
model.classifier.out_proj = torch.nn.Linear(model.classifier.out_proj.in_features, 3, bias=True) | |||
# adjust model parameters | |||
if cfg.peft_mode == "lora": | |||
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||
freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||
logging.info("LoRA model loaded") | |||
elif cfg.peft_mode == "bitfit": | |||
for a, b in model.roberta.named_parameters(): | |||
if not 'bias' in a: | |||
b.requires_grad = False | |||
logging.info("BiTFiT model loaded") | |||
elif cfg.peft_mode == "lorabitfit": | |||
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||
freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||
if cfg.two_step_training == 0: | |||
for a, b in model.roberta.named_parameters(): | |||
if 'bias' in a: | |||
b.requires_grad = True | |||
logging.info("LoRA and BiTFiT combined model loaded") | |||
elif cfg.peft_mode == "full": | |||
logging.info("Full model loaded") | |||
else: | |||
logging.info("No acceptable model to load") | |||
model.to(cfg.device) | |||
return model, tokenizer | |||
class LoRALayer(torch.nn.Module): | |||
def __init__( | |||
self, | |||
module: torch.nn.Linear, | |||
        rank: int,
alpha: float | |||
): | |||
super().__init__() | |||
self.rank = rank | |||
self.alpha = alpha | |||
self.scaling = self.alpha / self.rank # scaling factor | |||
self.in_dim = module.in_features | |||
self.out_dim = module.out_features | |||
self.pretrained = module | |||
self.sharif_llm_A = torch.nn.Linear(self.in_dim, self.rank, bias=False) | |||
torch.nn.init.kaiming_normal_(self.sharif_llm_A.weight) | |||
self.sharif_llm_B = torch.nn.Linear(self.rank, self.out_dim, bias=False) | |||
torch.nn.init.zeros_(self.sharif_llm_B.weight) | |||
def forward(self, x: torch.Tensor): | |||
pretrained_out = self.pretrained(x) | |||
lora_out = self.sharif_llm_A(x) # x@A | |||
lora_out = self.sharif_llm_B(lora_out) # x@A@B | |||
lora_out = self.scaling * lora_out # Scale by the scaling factor | |||
return pretrained_out + lora_out # x@W + x@A@B*(scaling_factor) | |||
def mutate_model(model: torch.nn.Module, rank: int, alpha: float): | |||
""" | |||
Replaces all linear layers in the model with LoRALinear layers. | |||
Freeze all params except LoRA params. | |||
""" | |||
    # make sure the model does not already contain LoRALayer modules; return if any are found
for name, module in model.named_modules(): | |||
if isinstance(module, LoRALayer): | |||
logging.info("Model already contains LoRALinear layers! \n Try reloading the model.") | |||
return | |||
# we want to replace all query and value Linear modules with LoRALayer | |||
for name, module in model.named_children(): | |||
# if the module is linear and the name is for query or value | |||
if isinstance(module, torch.nn.Linear) and (name == 'query' or name == 'value'): | |||
# replace the module with LoRALayer | |||
lora_layer = LoRALayer(module, rank, alpha) | |||
setattr(model, name, lora_layer) | |||
else: | |||
mutate_model(module, rank, alpha) # recursively call the function on the module | |||
def freeze_non_LoRA(model, peft_key): | |||
for param_name, weights in model.named_parameters(): | |||
weights.requires_grad = peft_key in param_name |
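# Hedged usage sketch: a tiny stand-in module (hypothetical, not part of the training code) to
# illustrate what mutate_model and freeze_non_LoRA do. Only children named 'query' or 'value'
# are wrapped in LoRALayer; afterwards only the sharif_llm_A / sharif_llm_B weights stay trainable.
if __name__ == "__main__":
    toy = torch.nn.ModuleDict({
        "query": torch.nn.Linear(16, 16),
        "value": torch.nn.Linear(16, 16),
        "dense": torch.nn.Linear(16, 16),
    })
    mutate_model(toy, rank=4, alpha=8)
    freeze_non_LoRA(toy, peft_key='sharif_llm')
    trainable = [name for name, p in toy.named_parameters() if p.requires_grad]
    print(trainable)  # expected: the LoRA A/B weights of 'query' and 'value' only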
@@ -0,0 +1,147 @@ | |||
from transformers import get_linear_schedule_with_warmup | |||
import logging | |||
import torch | |||
from torch.nn import CrossEntropyLoss | |||
from torch.optim import AdamW | |||
from tqdm import tqdm | |||
from sklearn.metrics import accuracy_score | |||
import wandb | |||
import math | |||
from opacus import PrivacyEngine | |||
from opacus.utils.batch_memory_manager import BatchMemoryManager | |||
class Trainer: | |||
def __init__(self, cfg, model, train_loader, checkpoint=None): | |||
self.criterion = CrossEntropyLoss() | |||
self.val_criterion = CrossEntropyLoss() | |||
self.optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, eps=cfg.optimizer_eps) | |||
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||
total_steps = math.ceil(len(train_loader) / self.gradient_accumulation_steps) * cfg.epochs | |||
if cfg.scheduler: | |||
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=cfg.scheduler_warmup_ratio*total_steps, num_training_steps=total_steps) | |||
self.dp = cfg.dp | |||
self.model = model | |||
self.cfg = cfg | |||
if cfg.dp: | |||
self.model.train() | |||
self.privacy_engine = PrivacyEngine( | |||
accountant="rdp", | |||
) | |||
if checkpoint: | |||
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||
self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon( | |||
module=self.model, | |||
optimizer=self.optimizer, | |||
data_loader=train_loader, | |||
target_epsilon=cfg.epsilon, | |||
target_delta=cfg.delta, | |||
epochs=cfg.epochs, | |||
max_grad_norm=cfg.clipping_threshold, | |||
) | |||
def train_step(self, train_loader): | |||
train_loss = 0 | |||
self.model.train() | |||
self.optimizer.zero_grad() | |||
if self.cfg.dp: | |||
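            # BatchMemoryManager splits each logical (virtual) batch into physical micro-batches of at
            # most cfg.batch_size and defers the actual DP optimizer update until the full logical batch
            # has been processed, keeping GPU memory bounded.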
with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader: | |||
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||
# Move batch tensors to the same device as the model | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
# Forward pass | |||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
loss = self.criterion(outputs.logits, batch["label"]) | |||
loss.backward() | |||
train_loss += loss.mean().item() | |||
self.optimizer.step() | |||
self.optimizer.zero_grad() | |||
if self.cfg.scheduler: | |||
self.scheduler.step() | |||
else: | |||
for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)): | |||
# Move batch tensors to the same device as the model | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
# Forward pass | |||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
loss = self.criterion(outputs.logits, batch["label"]) | |||
loss.backward() | |||
train_loss += loss.mean().item() | |||
self.optimizer.step() | |||
self.optimizer.zero_grad() | |||
if self.cfg.scheduler: | |||
self.scheduler.step() | |||
return train_loss/len(train_loader) | |||
def evaluate_step(self, val_loader): | |||
# Evaluation loop | |||
val_loss = 0 | |||
self.model.eval() | |||
predictions = [] | |||
true_labels = [] | |||
with torch.no_grad(): | |||
for batch in tqdm(val_loader): | |||
# Move batch tensors to the same device as the model | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
# Forward pass and compute validation loss | |||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
loss = self.val_criterion(outputs.logits, batch["label"]) | |||
_, preds = torch.max(outputs.logits, dim=1) | |||
predictions.extend(preds.tolist()) | |||
true_labels.extend(batch["label"].tolist()) | |||
val_loss += loss.item() | |||
accuracy = accuracy_score(true_labels, predictions) | |||
        return accuracy, val_loss/len(val_loader)
def train_and_evaluate(self, epochs, train_loader, val_loader_one, val_loader_two): | |||
best_accuracy = 0 | |||
best_accuracy_two = 0 | |||
wandb_log = [] | |||
for epoch in range(epochs): | |||
log_data = {} | |||
train_loss = self.train_step(train_loader) | |||
log_data["train_loss"] = train_loss | |||
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||
accuracy, val_loss = self.evaluate_step(val_loader=val_loader_one) | |||
log_data["validation_loss"] = val_loss | |||
log_data["accuracy"] = accuracy | |||
if accuracy > best_accuracy: | |||
best_accuracy = accuracy | |||
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||
logging.info(f"Accuracy on validation set: {accuracy * 100} %") | |||
if val_loader_two: | |||
accuracy_two , val_loss_two = self.evaluate_step(val_loader=val_loader_two) | |||
log_data["validation_two_loss"] = val_loss_two | |||
log_data["accuracy_two"] = accuracy_two | |||
if accuracy_two > best_accuracy_two: | |||
best_accuracy_two = accuracy_two | |||
logging.info(f"Epoch {epoch+1} Validation two loss: {val_loss_two}") | |||
logging.info(f"Accuracy on validation two set: {accuracy_two * 100} %") | |||
wandb_log.append(log_data) | |||
logging.info("Best results:") | |||
if self.cfg.dp: | |||
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||
logging.info(f"Best validatin accuracy: {best_accuracy}") | |||
if val_loader_two: | |||
logging.info(f"Second validation set accuracy: {best_accuracy_two}") | |||
if self.cfg.use_wandb: | |||
            for epoch_data in wandb_log:
                wandb.log(epoch_data)
@@ -0,0 +1,3 @@ | |||
# Text Generation Task on the E2E Dataset
Details coming soon!
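In the meantime, here is a minimal, hedged sketch of how the training entry point is typically launched. The flag names come from `config.py`; the entry-point file name (`main.py`) and the example values are assumptions for illustration, and the dataset/model-cache paths in `media.py` must be filled in first.

```python
# Hypothetical launcher: fine-tune GPT-2 on E2E NLG with a bottleneck adapter under differential privacy.
# All flags below are defined in config.py; adjust them to your setup.
import subprocess

subprocess.run([
    "python", "main.py",
    "--dataset", "e2e_nlg",        # one of: e2e_nlg, dart
    "--model_name", "gpt2",        # one of: gpt2, gpt2-medium, gpt2-large, gpt2-xl
    "--peft_mode", "adapter",      # one of: lora, bitfit, full, lorabitfit, adapter, adapterbitfit
    "--reduction_factor", "16",    # adapter bottleneck = n_embd // reduction_factor
    "--dp", "1",                   # enable differentially private fine-tuning (Opacus)
    "--epsilon", "3",              # target privacy budget
    "--batch_size", "8",           # physical batch size
    "--virtual_batch_size", "32",  # logical batch size used for each optimizer update
], check=True)
```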
@@ -0,0 +1,68 @@ | |||
import argparse | |||
import torch | |||
from media import * | |||
class Config: | |||
def __init__(self): | |||
self.parser = argparse.ArgumentParser() | |||
self.add_arguments() | |||
self.args = self.parse() | |||
self.post_process() | |||
def parse(self): | |||
return self.parser.parse_args() | |||
def add_arguments(self): | |||
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||
# self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||
        self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')
        self.parser.add_argument('--batch_size', type=int, default=8, help='Physical batch size for training')
        self.parser.add_argument('--virtual_batch_size', type=int, default=8, help='Logical batch size used for each optimizer update (gradient accumulation)')
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||
        self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||
self.parser.add_argument('--optimizer_eps', type=float, default=1e-6, help='optimizer eps') | |||
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||
self.parser.add_argument("--scheduler_type", type=str, default="linear", choices=['linear', 'steplr'], help="Scheduler types") | |||
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||
self.parser.add_argument('--scheduler_warmup_steps', type=int, default=None, help='Warmup steps can be given directly') | |||
self.parser.add_argument('--scheduler_step_size', type=int, default=1, help='Scheduler step size for stepLR scheduler') | |||
self.parser.add_argument('--scheduler_gamma', type=float, default=0.5, help='Scheduler decrease rate for stepLR scheduler') | |||
        self.parser.add_argument('--model_name', type=str, default='gpt2', choices=['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'], help='GPT-2 variant to fine-tune')
self.parser.add_argument('--seq_length', type=int, default=128, help='Max length for tokenization') | |||
self.parser.add_argument('--peft_mode', type=str, default='bitfit', choices=['lora', 'bitfit', 'full', 'lorabitfit', 'adapter', 'adapterbitfit'], help='PEFT mode for fine-tuning') | |||
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||
self.parser.add_argument('--drop_out', type=float, default=0.0, help='Dropout for lora') | |||
self.parser.add_argument('--reduction_factor', type=int, default=16, help='Reduction_factor for adapter') | |||
self.parser.add_argument('--dataset', type=str, default='e2e_nlg', choices=['e2e_nlg', 'dart'], help='Dataset name') | |||
        self.parser.add_argument('--toy_example', type=int, default=0, help='If 1, only the first 1024 training examples are used')
self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||
self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||
self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||
self.parser.add_argument("--beam_size", type=int, default=5, help="Number of beans for generation") | |||
self.parser.add_argument('--f', type=str, default=None, help='Path to Jupyter kernel JSON file') | |||
self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes adapter or lora then bitfit") | |||
        self.parser.add_argument('--lr_two', type=float, default=2e-3, help='Learning rate for the second step of training')
        self.parser.add_argument('--virtual_batch_size_two', type=int, default=8, help='Batch size for updating model parameters in the second step of training')
self.parser.add_argument('--epochs_two', type=int, default=5, help='Number of epochs for second step training') | |||
self.parser.add_argument('--weight_decay_two', type=float, default=0.1, help='Weight decay for second optimizer') | |||
def post_process(self): | |||
        assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||
self.args.media_path = media_path | |||
self.args.model_cache_path = model_cache_path |
@@ -0,0 +1,309 @@ | |||
from datasets import load_from_disk
import copy
import sys
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Tuple, Union
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding
def load_dataset(dataset_name, path, toy_example): | |||
dataset = load_from_disk(f"{path}saved_datasets/{dataset_name}") | |||
# toy example for develop | |||
if toy_example == 1: | |||
dataset["train"] = dataset["train"].select(range(1024)) | |||
dataset["validation"] = dataset["validation"].select(range(512)) | |||
return dataset | |||
def load_dataloaders(dataset, dataset_name, batch_size, virtual_batch_size, tokenizer, seq_length, dp=1): | |||
data_collator = DataCollatorForData2TextLanguageModeling(tokenizer) | |||
if dataset_name == 'e2e_nlg': | |||
train_dataset = E2ETextDataset(tokenizer, | |||
dataset["train"]["meaning_representation"], | |||
dataset["train"]["human_reference"], | |||
seq_length, | |||
tokenizer.bos_token, | |||
tokenizer.eos_token, | |||
seq_length) | |||
validation_dataset = E2ETextDataset(tokenizer, | |||
dataset["validation"]["meaning_representation"], | |||
dataset["validation"]["human_reference"], | |||
seq_length, | |||
tokenizer.bos_token, | |||
tokenizer.eos_token, | |||
seq_length) | |||
train_data_size = len(dataset["train"]) | |||
if dp == 1: | |||
sampler = WeightedRandomSampler([virtual_batch_size/train_data_size for _ in range(train_data_size)], num_samples=train_data_size, replacement=True) | |||
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, sampler=sampler, drop_last=True, collate_fn=data_collator) | |||
else: | |||
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, collate_fn=data_collator) | |||
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, collate_fn=data_collator) | |||
    elif dataset_name == 'dart':
        raise NotImplementedError("Data loading for the DART dataset is not implemented yet")
return train_loader, validation_loader | |||
# Copyright (c) Xuechen Li. All Rights Reserved. | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
class E2ETextDataset(Dataset): | |||
def __init__( | |||
self, | |||
tokenizer: PreTrainedTokenizer, | |||
src_lines, | |||
tgt_lines, | |||
block_size: int, | |||
bos_tok: str, | |||
eos_tok: str, | |||
max_seq_len=sys.maxsize, | |||
max_examples=sys.maxsize, | |||
**_, | |||
): | |||
src_lines = src_lines | |||
tgt_lines = tgt_lines | |||
edited_sents = [] | |||
for src, tgt in zip(src_lines, tgt_lines): | |||
sent = ' {} {} '.format(src, bos_tok) + tgt + ' {}'.format(eos_tok) | |||
edited_sents.append(sent) | |||
# --- Filter out super long sentences --- | |||
new_src_lines, new_tgt_lines, new_edited_sents = [], [], [] | |||
for src_line, tgt_line, edited_sent in zip(src_lines, tgt_lines, edited_sents): | |||
tokenized_edited_sent = tokenizer.tokenize(edited_sent) | |||
if len(tokenized_edited_sent) <= max_seq_len: | |||
new_src_lines.append(src_line) | |||
new_tgt_lines.append(tgt_line) | |||
new_edited_sents.append(edited_sent) | |||
del src_line, tgt_line, edited_sent | |||
src_lines, tgt_lines, edited_sents = new_src_lines, new_tgt_lines, new_edited_sents | |||
# --------------------------------------- | |||
# --- Truncate the dataset if necessary; this must be after the length filtering. --- | |||
src_lines = src_lines[:max_examples] | |||
tgt_lines = tgt_lines[:max_examples] | |||
edited_sents = edited_sents[:max_examples] | |||
# --- | |||
batch_encoding = tokenizer( | |||
edited_sents, | |||
add_special_tokens=True, | |||
truncation=True, | |||
max_length=block_size, | |||
is_split_into_words=False, | |||
) | |||
self.examples = batch_encoding["input_ids"] | |||
self.labels = copy.deepcopy(self.examples) | |||
# split into category words: | |||
ssl_lst = [] | |||
for ss in src_lines: | |||
ssl = [la.split(':')[0].strip() for la in ss.split('|')] | |||
ssl_lst.append(ssl) | |||
self.src_cat = tokenizer( | |||
ssl_lst, | |||
add_special_tokens=True, | |||
truncation=True, | |||
max_length=block_size, | |||
is_split_into_words=True | |||
)['input_ids'] | |||
self.src_sent = [] | |||
self.tgt_sent = [] | |||
# temp_src_len = 0 | |||
# temp_tgt_len = 0 | |||
# temp_count = 0 | |||
separator = tokenizer(bos_tok, add_special_tokens=False)['input_ids'][0] | |||
for i, elem in enumerate(self.labels): | |||
sep_idx = elem.index(separator) + 1 | |||
self.src_sent.append(self.examples[i][:sep_idx - 1]) | |||
self.tgt_sent.append(self.examples[i][sep_idx - 1:]) | |||
self.labels[i][:sep_idx] = [-100] * sep_idx # Doesn't contribute to loss. | |||
# temp_src_len += sep_idx - 1 | |||
# temp_tgt_len += len(elem) - (sep_idx - 1) | |||
# temp_count += 1 | |||
# print('tgt_avg: ', temp_tgt_len / temp_count) | |||
# print('src_avg: ', temp_src_len / temp_count) | |||
# print('ratios: ', temp_src_len / temp_tgt_len) | |||
# print(self.labels[0]) | |||
# print(self.examples[0]) | |||
# print(edited_sents[0]) | |||
# print(self.src_sent[0]) | |||
# print(self.tgt_sent[0]) | |||
# print(self.src_cat[0]) | |||
assert len(self.src_cat) == len(self.examples) | |||
def __len__(self): | |||
return len(self.examples) | |||
def __getitem__(self, i): | |||
return ( | |||
torch.tensor(self.examples[i], dtype=torch.long), | |||
torch.tensor(self.labels[i], dtype=torch.long), | |||
torch.tensor(self.src_sent[i], dtype=torch.long), | |||
torch.tensor(self.tgt_sent[i], dtype=torch.long), | |||
torch.tensor(self.src_cat[i], dtype=torch.long), | |||
) | |||
# InputDataClass = NewType("InputDataClass", Any) | |||
""" | |||
A DataCollator is a function that takes a list of samples from a Dataset | |||
and collate them into a batch, as a dictionary of Tensors. | |||
""" | |||
# DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]]) | |||
@dataclass | |||
class DataCollatorForData2TextLanguageModeling: | |||
""" | |||
Data collator used for language modeling. | |||
- collates batches of tensors, honoring their tokenizer's pad_token | |||
- preprocesses batches for masked language modeling | |||
""" | |||
tokenizer: PreTrainedTokenizer | |||
mlm: bool = False | |||
format_mode: str = 'cat' | |||
mlm_probability: float = 0.15 | |||
def __call__( | |||
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||
) -> Dict[str, torch.Tensor]: | |||
if isinstance(examples[0], (dict, BatchEncoding)): | |||
examples = [e["input_ids"] for e in examples] | |||
input_ids, labels, src, tgt, cate = zip(*examples) | |||
        if self.mlm:
            # Tensorize the raw input ids before masking.
            batch = self._tensorize_batch(input_ids)
            inputs, labels = self.mask_tokens(batch)
            return {"input_ids": inputs, "labels": labels}
else: | |||
if self.format_mode == 'cat': | |||
mode_input = 3 | |||
elif self.format_mode == 'peek': | |||
mode_input = 1 | |||
elif self.format_mode == 'nopeek': | |||
mode_input = 2 | |||
elif self.format_mode == 'infix': | |||
mode_input = 4 | |||
# mode_input = 1 # means that we take the input again. | |||
# mode_input = 2 # means that we do not peek at src again. | |||
# mode_input = 3 # means that we look at the categories, and see the input again. | |||
if mode_input == 1: | |||
# input, batch | |||
batch = self._tensorize_batch(input_ids) | |||
labels = self._tensorize_batch(labels) | |||
src = self._tensorize_batch(src) | |||
cate_batch, cate_attn = None, None | |||
# tgt = self._tensorize_batch(tgt) | |||
elif mode_input == 2: | |||
# nopeek. | |||
batch = self._tensorize_batch(tgt) | |||
labels = batch.clone() | |||
src = self._tensorize_batch(src) | |||
cate_batch, cate_attn = None, None | |||
elif mode_input == 3: | |||
batch = self._tensorize_batch(input_ids) | |||
labels = self._tensorize_batch(labels) | |||
src = self._tensorize_batch(cate) | |||
cate_batch, cate_attn = None, None | |||
elif mode_input == 4: | |||
batch = self._tensorize_batch(tgt) | |||
labels = batch.clone() | |||
src = self._tensorize_batch(src) | |||
cate_batch = self._tensorize_batch(cate) | |||
cate_attn = (cate_batch != self.tokenizer.pad_token_id) | |||
labels[labels == self.tokenizer.pad_token_id] = -100 # tgt | |||
src_attn = (src != self.tokenizer.pad_token_id) # src | |||
tgt_attn = (batch != self.tokenizer.pad_token_id) # tgt | |||
if cate_batch is None: | |||
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn':tgt_attn, | |||
'src':src} | |||
else: | |||
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn': tgt_attn, | |||
'src': src, "cate_batch":cate_batch, "cate_attn":cate_attn} | |||
def _tensorize_batch( | |||
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||
) -> torch.Tensor: | |||
# In order to accept both lists of lists and lists of Tensors | |||
if isinstance(examples[0], (list, tuple)): | |||
examples = [torch.tensor(e, dtype=torch.long) for e in examples] | |||
length_of_first = examples[0].size(0) | |||
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) | |||
if are_tensors_same_length: | |||
return torch.stack(examples, dim=0) | |||
else: | |||
if self.tokenizer._pad_token is None: | |||
raise ValueError( | |||
"You are attempting to pad samples but the tokenizer you are using" | |||
f" ({self.tokenizer.__class__.__name__}) does not have one." | |||
) | |||
return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) | |||
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: | |||
""" | |||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. | |||
""" | |||
if self.tokenizer.mask_token is None: | |||
raise ValueError( | |||
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." | |||
) | |||
labels = inputs.clone() | |||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) | |||
probability_matrix = torch.full(labels.shape, self.mlm_probability) | |||
special_tokens_mask = [ | |||
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() | |||
] | |||
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) | |||
if self.tokenizer._pad_token is not None: | |||
padding_mask = labels.eq(self.tokenizer.pad_token_id) | |||
probability_matrix.masked_fill_(padding_mask, value=0.0) | |||
masked_indices = torch.bernoulli(probability_matrix).bool() | |||
labels[~masked_indices] = -100 # We only compute loss on masked tokens | |||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) | |||
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices | |||
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) | |||
# 10% of the time, we replace masked input tokens with random word | |||
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced | |||
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) | |||
inputs[indices_random] = random_words[indices_random] | |||
# The rest of the time (10% of the time) we keep the masked input tokens unchanged | |||
return inputs, labels |
@@ -0,0 +1,125 @@ | |||
from config import Config | |||
import os | |||
import random | |||
import numpy as np | |||
import torch | |||
import wandb | |||
import logging | |||
import transformers | |||
import warnings | |||
import subprocess | |||
from model import load_model, prepare_model, get_number_of_trainable_parameters, load_model_weights, get_number_of_parameters | |||
from data import load_dataset, load_dataloaders | |||
from train import Trainer, generate_evaluation_output, save_evaluation_output | |||
from utils import clean_hyperparameters, copy_model_weights | |||
warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes") | |||
transformers.logging.set_verbosity_error() | |||
def set_seeds(seed: int): | |||
os.environ['PYTHONHASHSEED'] = str(seed) | |||
random.seed(seed) | |||
np.random.seed(seed) | |||
torch.manual_seed(seed) | |||
torch.cuda.manual_seed(seed) | |||
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||
transformers.set_seed(seed) | |||
def run_metric_script(file_path): | |||
result = subprocess.run(["e2e/measure_scores.py", "-p", "e2e_ref.txt", file_path], stdout=subprocess.PIPE) | |||
output = result.stdout.decode('utf-8') | |||
lines = output.split('\n') | |||
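    # The e2e-metrics measure_scores.py script prints a score block at the end of its output;
    # this slice is meant to keep the five metric lines (typically BLEU, NIST, METEOR, ROUGE_L, CIDEr).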
return lines[-7:-2] | |||
def main(cfg): | |||
set_seeds(cfg.seed) | |||
model, tokenizer = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||
model = prepare_model(model, cfg) | |||
num_of_all_params = get_number_of_parameters(model) | |||
    num_of_trainable_params = get_number_of_trainable_parameters(model)
    percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
    logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
dataset = load_dataset(cfg.dataset, cfg.media_path, cfg.toy_example) | |||
cfg.train_data_size = len(dataset["train"]) | |||
# dataset = tokenize_dataset(tokenizer, dataset, cfg.dataset, cfg.seq_length) | |||
train_loader, validation_loader = load_dataloaders(dataset, cfg.dataset, cfg.batch_size, cfg.virtual_batch_size, tokenizer, cfg.seq_length, cfg.dp) | |||
logging.info("Dataset loaded and tokenized") | |||
trainer = Trainer(cfg, model, train_loader) | |||
trainer.train_and_evaluate(cfg.epochs, train_loader, validation_loader) | |||
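    # Optional second stage (--two_step_training, only together with --dp here): freeze the adapter/LoRA
    # weights, make only the bias terms trainable (BiTFiT), and continue training with the second-stage
    # hyperparameters (epochs_two, lr_two, weight_decay_two).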
if cfg.two_step_training and cfg.dp: | |||
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||
model_two, _ = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||
model_two = prepare_model(model_two, cfg) | |||
copy_model_weights(model, model_two) | |||
del model | |||
model = model_two | |||
for a, b in model.named_parameters(): | |||
if 'bias' in a and not 'adapter' in a: | |||
b.requires_grad = True | |||
else: | |||
b.requires_grad = False | |||
logging.info("New Model adjusted") | |||
num_of_all_params = get_number_of_parameters(model) | |||
        num_of_trainable_params = get_number_of_trainable_parameters(model)
        percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
        logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
trainer_two = Trainer(cfg, model, train_loader, second_trainer=True) | |||
trainer_two.train_and_evaluate(cfg.epochs_two, train_loader, validation_loader) | |||
# evaluate model on test data | |||
model.eval() | |||
model = load_model_weights(model, cfg.peft_mode, f"{trainer.save_path}/{trainer.model_name}.pth") | |||
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size) | |||
output_path = f"{cfg.media_path}generation_eval_outputs/{cfg.dataset}/{cfg.peft_mode}" | |||
if not os.path.exists(output_path): | |||
os.makedirs(output_path) | |||
output_name = cfg.run_name if cfg.run_name else "generation_output" | |||
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v1.txt") | |||
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size, do_sample=True) | |||
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v2.txt") | |||
logging.info("Generation for test data saved") | |||
metrics = run_metric_script(f"{output_path}/{output_name}-v1.txt") | |||
logging.info("Metrics without sampling:") | |||
for metric in metrics: | |||
logging.info(metric) | |||
metrics = run_metric_script(f"{output_path}/{output_name}-v2.txt") | |||
logging.info("Metrics with sampling:") | |||
for metric in metrics: | |||
logging.info(metric) | |||
if cfg.use_wandb: | |||
wandb.finish() | |||
if __name__ == "__main__": | |||
cfg = Config().args | |||
log_path = f"logs/{cfg.dataset}/{cfg.peft_mode}/" | |||
if not os.path.exists(log_path): | |||
os.makedirs(log_path) | |||
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||
if cfg.use_wandb: | |||
wandb.login(key="YOUR_KEY") | |||
if cfg.run_name: | |||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||
else: | |||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||
log_file_name = wandb.run.name | |||
logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||
logging.info("Start of the logging") | |||
hyperparameters = {key: value for key, value in vars(cfg).items()} | |||
hyperparameters = clean_hyperparameters(hyperparameters) | |||
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||
logging.info("config:\n" + hyperparameters_str) | |||
main(cfg) |
@@ -0,0 +1,2 @@ | |||
media_path = "YOUR_MEDIA_PATH" | |||
model_cache_path = "YOUR_CACHE_PATH" |
@@ -0,0 +1,299 @@ | |||
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel | |||
import torch | |||
import transformers | |||
from torch import nn | |||
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP | |||
import os | |||
# Loads model and its tokenizer | |||
def load_model(model_name, cache_dir="."): | |||
tokenizer = GPT2Tokenizer.from_pretrained(f"{cache_dir}gpt2/{model_name}-tokenizer") | |||
model = GPT2LMHeadModel.from_pretrained(f"{cache_dir}gpt2/{model_name}-model") | |||
add_pad_token(model, tokenizer) | |||
model.requires_grad_(False) | |||
return model, tokenizer | |||
# Adds padding token to the tokenizer and model embedding layer | |||
def add_pad_token(model, tokenizer): | |||
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) | |||
model.resize_token_embeddings(len(tokenizer)) | |||
a = model.get_input_embeddings().weight | |||
a.data[-1] = a.data[:-1].mean(dim=0) | |||
# Returns the number of trainable parameters of the model
def get_number_of_trainable_parameters(model): | |||
return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||
# Returns number of parameters of the model | |||
def get_number_of_parameters(model): | |||
return sum(p.numel() for p in model.parameters()) | |||
# Mutates model structure and adjusts trainable parameters | |||
def prepare_model(model, cfg): | |||
if cfg.peft_mode == 'bitfit': | |||
for a, b in model.named_parameters(): | |||
if 'bias' in a: | |||
b.requires_grad = True | |||
elif cfg.peft_mode == 'lora': | |||
model.requires_grad_(True) | |||
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||
mark_only_lora_as_trainable(model) | |||
elif cfg.peft_mode == 'lorabitfit': | |||
model.requires_grad_(True) | |||
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||
mark_only_lora_as_trainable(model) | |||
if cfg.two_step_training == 0: | |||
for a, b in model.named_parameters(): | |||
if 'bias' in a: | |||
b.requires_grad = True | |||
elif cfg.peft_mode == 'full': | |||
model.requires_grad_(True) | |||
elif cfg.peft_mode == 'adapter': | |||
model.requires_grad_(False) | |||
bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||
mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||
for a, b in model.named_parameters(): | |||
if 'adapter' in a: | |||
b.requires_grad = True | |||
elif cfg.peft_mode == 'adapterbitfit': | |||
model.requires_grad_(False) | |||
bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||
mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||
if cfg.two_step_training == 0: | |||
for a, b in model.named_parameters(): | |||
if 'adapter' in a or 'bias' in a: | |||
b.requires_grad = True | |||
else: | |||
for a, b in model.named_parameters(): | |||
if 'adapter' in a: | |||
b.requires_grad = True | |||
model.to(cfg.device) | |||
return model | |||
def save_model(model, peft_mode, save_path, model_name): | |||
if not os.path.exists(save_path): | |||
os.makedirs(save_path) | |||
if peft_mode == "bitfit": | |||
bias_params = {} | |||
for name, param in model.named_parameters(): | |||
if 'bias' in name: | |||
bias_params[name] = param.data.clone() | |||
torch.save(bias_params, f'{save_path}/{model_name}.pth') | |||
elif peft_mode == 'lora': | |||
lora_params = {} | |||
for name, param in model.named_parameters(): | |||
if 'lora' in name: | |||
lora_params[name] = param.data.clone() | |||
torch.save(lora_params, f'{save_path}/{model_name}.pth') | |||
elif peft_mode == 'lorabitfit': | |||
lorabitfit_params = {} | |||
for name, param in model.named_parameters(): | |||
if 'lora' in name or 'bias' in name: | |||
lorabitfit_params[name] = param.data.clone() | |||
torch.save(lorabitfit_params, f'{save_path}/{model_name}.pth') | |||
elif peft_mode == 'full': | |||
pass | |||
elif peft_mode == 'adapter': | |||
adapter_params = {} | |||
for name, param in model.named_parameters(): | |||
if 'adapter' in name: | |||
adapter_params[name] = param.data.clone() | |||
torch.save(adapter_params, f'{save_path}/{model_name}.pth') | |||
elif peft_mode == 'adapterbitfit': | |||
adapterbitfit_params = {} | |||
for name, param in model.named_parameters(): | |||
if 'adapter' in name or 'bias' in name: | |||
adapterbitfit_params[name] = param.data.clone() | |||
torch.save(adapterbitfit_params, f'{save_path}/{model_name}.pth') | |||
def load_model_weights(model, peft_mode, path): | |||
if peft_mode == 'full': | |||
pass | |||
else: | |||
model_weights = torch.load(path) | |||
with torch.no_grad(): | |||
for name, param in model.named_parameters(): | |||
if name in model_weights: | |||
param.copy_(model_weights[name]) | |||
return model | |||
# Copyright (c) Xuechen Li. All Rights Reserved. | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
""" | |||
LoRA layers. | |||
This version does not have merged weights for zero latency inference. It makes the code easier to read and maintain. | |||
Adapted from | |||
https://github.com/microsoft/LoRA | |||
https://www.microsoft.com/en-us/research/project/dp-transformers/ | |||
""" | |||
class MYDPMergedLinear(nn.Module): | |||
def __init__( | |||
self, | |||
in_features: int, | |||
out_features: int, | |||
pretrained_module, | |||
lora_r=0, | |||
lora_alpha=1., | |||
lora_dropout=0., | |||
): | |||
super(MYDPMergedLinear, self).__init__() | |||
self.pretrained_module = pretrained_module | |||
self.lora_r = lora_r | |||
self.lora_alpha = lora_alpha | |||
self.lora_dropout = nn.Dropout(p=lora_dropout) | |||
if self.lora_r > 0: | |||
self.lora_A = nn.Linear(in_features=in_features, out_features=lora_r, bias=False) | |||
self.lora_B = nn.Linear(in_features=lora_r, out_features=out_features, bias=False) | |||
self.scaling = self.lora_alpha / lora_r | |||
self.reset_parameters() | |||
def forward(self, x: torch.Tensor): | |||
result = self.pretrained_module(x) | |||
if self.lora_r > 0: | |||
after_dropout = self.lora_dropout(x) | |||
after_A = self.lora_A(after_dropout) | |||
after_B = self.lora_B(after_A) | |||
result += after_B * self.scaling | |||
return result | |||
def reset_parameters(self): | |||
# self.linear.reset_parameters() | |||
if self.lora_r > 0: | |||
self.lora_A.reset_parameters() | |||
self.lora_B.weight.data.zero_() | |||
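            # Zero-initialising lora_B makes the initial low-rank update a no-op, so the wrapped
            # layer starts out identical to the pretrained layer.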
@staticmethod | |||
def from_transformers_conv1d( | |||
original_layer, | |||
lora_r=0, | |||
lora_alpha=1., | |||
lora_dropout=0., | |||
) -> "MYDPMergedLinear": | |||
lora_layer = MYDPMergedLinear( | |||
in_features=original_layer.weight.shape[0], | |||
out_features=original_layer.weight.shape[1], | |||
pretrained_module = original_layer, | |||
lora_r=lora_r, | |||
lora_alpha=lora_alpha, | |||
lora_dropout=lora_dropout, | |||
).to(original_layer.weight.device) | |||
return lora_layer | |||
def convert_gpt2_attention_to_lora( | |||
model: transformers.GPT2PreTrainedModel, | |||
lora_r=0, | |||
lora_alpha=1., | |||
lora_dropout=0., | |||
) -> transformers.GPT2PreTrainedModel: | |||
if not isinstance(model, transformers.GPT2PreTrainedModel): | |||
raise TypeError("Requires a GPT2 model") | |||
if not hasattr(model, "h") and hasattr(model, "transformer"): | |||
transformer = model.transformer | |||
else: | |||
transformer = model | |||
for h_i in transformer.h: | |||
new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||
original_layer=h_i.attn.c_attn, | |||
lora_r=lora_r, | |||
lora_alpha=lora_alpha, | |||
lora_dropout=lora_dropout, | |||
) | |||
h_i.attn.c_attn = new_layer | |||
return model | |||
def mutate_model(model: torch.nn.Module, lora_r=0, lora_alpha=1., lora_dropout=0.): | |||
for name, module in model.named_children(): | |||
if name == "c_attn": | |||
new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||
original_layer=module, | |||
lora_r=lora_r, | |||
lora_alpha=lora_alpha, | |||
lora_dropout=lora_dropout, | |||
) | |||
setattr(model, name, new_layer) | |||
else: | |||
mutate_model(module, lora_r, lora_alpha, lora_dropout) # recursively call the function on the module | |||
def mark_only_lora_as_trainable(model: torch.nn.Module) -> None: | |||
model.requires_grad_(True) | |||
for n, p in model.named_parameters(): | |||
if 'lora_' not in n: | |||
p.requires_grad = False | |||
class AdapterLayer(nn.Module): | |||
def __init__( | |||
self, | |||
emb_dim: int, | |||
bottleneck_size: int, | |||
bias = True | |||
): | |||
super().__init__() | |||
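        # Bottleneck adapter: down-project to bottleneck_size, apply a non-linearity, project back
        # to emb_dim; forward() adds the result to the input as a residual connection.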
self.sharif_llm_adapter = nn.Sequential( | |||
nn.Linear(emb_dim, bottleneck_size, bias=bias), | |||
nn.ReLU(), | |||
nn.Linear(bottleneck_size, emb_dim, bias=bias) | |||
) | |||
def forward(self, x: torch.Tensor): | |||
output = x + self.sharif_llm_adapter(x) | |||
return output | |||
class FeedForwardAdapterWrapper(nn.Module): | |||
def __init__( | |||
self, | |||
original_module: GPT2MLP, | |||
bottleneck_size: int, | |||
emb_dim, | |||
bias = True | |||
): | |||
super().__init__() | |||
assert isinstance(original_module, GPT2MLP) | |||
self.original_module = original_module | |||
self.adapter = AdapterLayer(emb_dim, bottleneck_size, bias=bias) | |||
def forward(self, x: torch.Tensor): | |||
output = self.original_module(x) | |||
output = self.adapter(output) | |||
return output | |||
def mutate_model_recursive_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||
for name, module in model.named_children(): | |||
if isinstance(module, GPT2MLP): | |||
feed_forward_with_adapter = FeedForwardAdapterWrapper(module, bottleneck_size, emb_dim, bias) | |||
setattr(model, name, feed_forward_with_adapter) | |||
else: | |||
mutate_model_recursive_adapter(module, bottleneck_size, emb_dim, bias) # recursively call the function on the module | |||
def mutate_model_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||
if hasattr(model, '_mutated'): | |||
print("Model already contains adapter layers! \n Try reloading the model.") | |||
return | |||
mutate_model_recursive_adapter(model, bottleneck_size, emb_dim, bias) | |||
model._mutated = True |
@@ -0,0 +1,211 @@ | |||
from transformers import get_linear_schedule_with_warmup | |||
import logging | |||
import torch | |||
import torch.nn.functional as F | |||
from torch.optim import AdamW | |||
from tqdm import tqdm | |||
from opacus import PrivacyEngine | |||
from opacus.utils.batch_memory_manager import BatchMemoryManager | |||
import wandb | |||
import math | |||
from model import save_model | |||
from torch.optim.lr_scheduler import StepLR | |||
class Trainer: | |||
def __init__(self, cfg, model, train_loader, checkpoint=None, second_trainer=False): | |||
if second_trainer: | |||
self.epochs = cfg.epochs_two | |||
self.lr = cfg.lr_two | |||
self.weight_decay = cfg.weight_decay_two | |||
else: | |||
self.epochs = cfg.epochs | |||
self.lr = cfg.lr | |||
self.weight_decay = cfg.weight_decay | |||
self.optimizer = AdamW(model.parameters(), lr=self.lr, weight_decay=self.weight_decay, eps=cfg.optimizer_eps) | |||
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||
total_steps = len(train_loader) * self.gradient_accumulation_steps * self.epochs | |||
if cfg.scheduler: | |||
if cfg.scheduler_type == "linear": | |||
warmup_steps = cfg.scheduler_warmup_steps if cfg.scheduler_warmup_steps else cfg.scheduler_warmup_ratio*total_steps | |||
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) | |||
elif cfg.scheduler_type == "steplr": | |||
self.scheduler = StepLR(self.optimizer, step_size=cfg.scheduler_step_size, gamma=cfg.scheduler_gamma) | |||
self.dp = cfg.dp | |||
self.model = model | |||
self.cfg = cfg | |||
self.save_path = f"{cfg.media_path}generation_saved_models/{cfg.dataset}/{cfg.peft_mode}" | |||
self.model_name = self.cfg.run_name if self.cfg.run_name else "best_model" | |||
if cfg.dp: | |||
self.model.train() | |||
self.privacy_engine = PrivacyEngine( | |||
accountant="rdp", | |||
) | |||
if checkpoint: | |||
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||
self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon( | |||
module=self.model, | |||
optimizer=self.optimizer, | |||
data_loader=train_loader, | |||
target_epsilon=cfg.epsilon, | |||
target_delta=cfg.delta, | |||
epochs=self.epochs, | |||
max_grad_norm=cfg.clipping_threshold, | |||
) | |||
def train_step(self, train_loader): | |||
train_loss = 0 | |||
self.model.train() | |||
self.optimizer.zero_grad() | |||
if self.dp: | |||
with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader: | |||
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||
# Move batch tensors to the same device as the model | |||
batch = prepare_inputs(batch) | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
# Forward pass | |||
outputs = self.model(**batch) | |||
loss = outputs.loss | |||
loss.backward() | |||
train_loss += loss.item() | |||
self.optimizer.step() | |||
self.optimizer.zero_grad() | |||
if self.cfg.scheduler and self.cfg.scheduler_type == "linear": | |||
self.scheduler.step() | |||
if self.cfg.scheduler and self.cfg.scheduler_type == "steplr": | |||
self.scheduler.step() | |||
else: | |||
            for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)):
# Move batch tensors to the same device as the model | |||
batch = prepare_inputs(batch) | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
# Forward pass | |||
outputs = self.model(**batch) | |||
loss = outputs.loss | |||
loss.backward() | |||
train_loss += loss.item() | |||
self.optimizer.step() | |||
self.optimizer.zero_grad() | |||
if self.cfg.scheduler and self.cfg.scheduler_type == "linear": | |||
self.scheduler.step() | |||
if self.cfg.scheduler and self.cfg.scheduler_type == "steplr": | |||
self.scheduler.step() | |||
return train_loss/len(train_loader) | |||
def evaluate_step(self, val_loader): | |||
# Evaluation loop | |||
val_loss = 0 | |||
self.model.eval() | |||
with torch.no_grad(): | |||
for batch in tqdm(val_loader): | |||
# Move batch tensors to the same device as the model | |||
batch = prepare_inputs(batch) | |||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
outputs = self.model(**batch) | |||
loss = compute_loss_per_input(outputs, batch) | |||
val_loss += loss.mean().item() | |||
return val_loss/len(val_loader) | |||
def train_and_evaluate(self, epochs, train_loader, val_loader): | |||
best_validation_loss = None | |||
best_epoch = 0 | |||
wandb_log = [] | |||
for epoch in range(epochs): | |||
log_data = {} | |||
train_loss = self.train_step(train_loader) | |||
log_data["train_loss"] = train_loss | |||
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||
val_loss = self.evaluate_step(val_loader=val_loader) | |||
log_data["validation_loss"] = val_loss | |||
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||
if best_validation_loss is None or val_loss < best_validation_loss: | |||
best_validation_loss = val_loss | |||
best_epoch = epoch | |||
save_model(self.model, self.cfg.peft_mode, self.save_path, self.model_name) | |||
logging.info(f"Model improved and saved for epoch {epoch+1}") | |||
wandb_log.append(log_data) | |||
logging.info("Best results:") | |||
if self.cfg.dp: | |||
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||
logging.info(f"Best validatin loss: {best_validation_loss} for Epoch: {best_epoch+1}") | |||
if self.cfg.use_wandb: | |||
            for epoch_data in wandb_log:
                wandb.log(epoch_data)
def prepare_inputs(batch): | |||
batch.pop('src_attn', None) | |||
batch.pop('tgt_attn', None) | |||
batch.pop('src', None) | |||
return batch | |||
def compute_loss_per_input(outputs, batch): | |||
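    # Per-example loss: shift logits/labels for next-token prediction, ignore positions labelled
    # -100 (the meaning-representation prompt and padding), and average over the remaining target tokens.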
logits = outputs.logits | |||
shift_logits = logits[..., :-1, :].contiguous() | |||
shift_labels = batch["labels"][..., 1:].contiguous() | |||
seq_lens = (shift_labels != -100).sum(dim=1) | |||
loss = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none") | |||
loss = loss.sum(dim=1) / seq_lens | |||
return loss | |||
def save_evaluation_output(outputs, path): | |||
with open(path, "w") as file: | |||
for strings in outputs: | |||
for string in strings: | |||
file.write(string + "\n") | |||
# file.write("\n") | |||
def generate_evaluation_output(model, tokenizer, data, device, max_length, beam_size=5, do_sample=False, num_return_sequences=1): | |||
generated_texts = [] | |||
prev = None | |||
for entry in tqdm(data): | |||
if prev != entry["meaning_representation"]: | |||
prev = entry["meaning_representation"] | |||
prompt = f"{entry['meaning_representation']} {tokenizer.eos_token}" | |||
inputs = tokenizer(prompt, return_tensors="pt", truncation=True) | |||
inputs = {key: val.to(device) for key, val in inputs.items()} | |||
with torch.no_grad(): | |||
outputs = model.generate(**inputs, | |||
num_beams=beam_size, | |||
max_length=max_length, | |||
do_sample=do_sample, | |||
early_stopping=True, | |||
min_length=5, | |||
num_return_sequences=num_return_sequences, | |||
bad_words_ids = [[628], [198], [tokenizer.pad_token_id]], | |||
pad_token_id=tokenizer.eos_token_id, | |||
repetition_penalty=1, | |||
top_k=0, | |||
top_p=0.9) | |||
temp_generated_texts = [] | |||
for output in outputs: | |||
generated_text = tokenizer.decode(output[len(inputs["input_ids"][0]):], skip_special_tokens=True) | |||
temp_generated_texts.append(generated_text.strip()) | |||
generated_texts.append(temp_generated_texts) | |||
return generated_texts | |||
@@ -0,0 +1,42 @@ | |||
import torch | |||
def clean_hyperparameters(hyperparameters: dict): | |||
if hyperparameters["scheduler"] == 0: | |||
hyperparameters.pop("scheduler_type", None) | |||
hyperparameters.pop("scheduler_warmup_ratio", None) | |||
hyperparameters.pop("scheduler_warmup_steps", None) | |||
hyperparameters.pop("scheduler_step_size", None) | |||
hyperparameters.pop("scheduler_gamma", None) | |||
if hyperparameters["peft_mode"] != "lora": | |||
hyperparameters.pop("rank", None) | |||
hyperparameters.pop("alpha", None) | |||
hyperparameters.pop("drop_out", None) | |||
if hyperparameters["peft_mode"] != "adapter" and hyperparameters["peft_mode"] != "adapterbitfit": | |||
hyperparameters.pop("reduction_factor", None) | |||
if hyperparameters["dp"] == 0: | |||
hyperparameters.pop("epsilon", None) | |||
hyperparameters.pop("delta", None) | |||
hyperparameters.pop("clipping_mode", None) | |||
hyperparameters.pop("clipping_threshold", None) | |||
if hyperparameters["use_wandb"] == 0: | |||
hyperparameters.pop("wandb_project_name", None) | |||
hyperparameters.pop("use_wandb", None) | |||
if hyperparameters["two_step_training"] == 0: | |||
hyperparameters.pop("lr_two", None) | |||
hyperparameters.pop("virtual_batch_size_two", None) | |||
hyperparameters.pop("epochs_two", None) | |||
hyperparameters.pop("weight_decay_two", None) | |||
hyperparameters.pop("f", None) | |||
hyperparameters.pop("media_path", None) | |||
hyperparameters.pop("model_cache_path", None) | |||
return hyperparameters | |||
def copy_model_weights(model1, model2): | |||
model1.eval() | |||
model2.eval() | |||
params1 = model1.parameters() | |||
params2 = model2.parameters() | |||
with torch.no_grad(): | |||
for param1, param2 in zip(params1, params2): | |||
param2.data.copy_(param1.data) |
@@ -0,0 +1,3 @@ | |||
# Privacy-Preserving Fine-tuning of Parameter-Efficient Language Models | |||
Details coming soon!