# Sentence Classification Task on GLUE Benchmark
Details coming soon!
import argparse | |||||
import torch | |||||
from media import media_path | |||||
class Config: | |||||
def __init__(self): | |||||
self.parser = argparse.ArgumentParser() | |||||
self.add_arguments() | |||||
self.args = self.parse() | |||||
self.post_process() | |||||
def parse(self): | |||||
return self.parser.parse_args() | |||||
def add_arguments(self): | |||||
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||||
self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||||
self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')
self.parser.add_argument('--batch_size', type=int, default=16, help='batch size for training ') | |||||
self.parser.add_argument('--virtual_batch_size', type=int, default=16, help='batch size for updating model parameters') | |||||
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||||
self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||||
self.parser.add_argument('--optimizer_eps', type=float, default=1e-8, help='optimizer eps') | |||||
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||||
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||||
self.parser.add_argument('--max_length', type=int, default=128, help='Max length for tokenization') | |||||
self.parser.add_argument('--peft_mode', type=str, default='lora', choices=['lora', 'bitfit', 'full', 'lorabitfit'], help='PEFT mode for fine-tuning') | |||||
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||||
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||||
self.parser.add_argument('--dataset', type=str, default='sst2', choices=['sst2', 'mnli', 'qqp', 'qnli'], help='Dataset name') | |||||
self.parser.add_argument('--toy_example', type=int, default=0, help='if 1, the first 1024 data from train dataset will be used for training') | |||||
self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||||
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||||
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||||
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||||
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||||
self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||||
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||||
self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||||
self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes lora then bitfit") | |||||
def post_process(self): | |||||
assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||||
self.args.media_path = media_path |
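# Illustrative sketch (hypothetical helper, not referenced elsewhere): in the DP
# training path each "virtual" batch of size virtual_batch_size is split into
# physical batches of size batch_size before a single optimizer update, which is
# why post_process() asserts that virtual_batch_size is divisible by batch_size.
def _physical_batches_per_update(batch_size: int = 16, virtual_batch_size: int = 64) -> int:
    assert virtual_batch_size % batch_size == 0
    return virtual_batch_size // batch_size  # e.g. 4 physical batches per update here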
from config import Config | |||||
from src.model import prepare_model | |||||
from src.data import prepare_data | |||||
from src.train import Trainer | |||||
import os | |||||
import random | |||||
import numpy as np | |||||
import torch | |||||
import wandb | |||||
import logging | |||||
import transformers | |||||
import warnings | |||||
warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes ") | |||||
transformers.logging.set_verbosity_error() | |||||
def set_seeds(seed: int): | |||||
os.environ['PYTHONHASHSEED'] = str(seed) | |||||
random.seed(seed) | |||||
np.random.seed(seed) | |||||
torch.manual_seed(seed) | |||||
torch.cuda.manual_seed(seed) | |||||
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||||
transformers.set_seed(seed) | |||||
def copy_model_weights(model1, model2): | |||||
model1.eval() | |||||
model2.eval() | |||||
params1 = model1.parameters() | |||||
params2 = model2.parameters() | |||||
with torch.no_grad(): | |||||
for param1, param2 in zip(params1, params2): | |||||
param2.data.copy_(param1.data) | |||||
# Returns the number of trainable parameters of the model
def get_number_of_trainable_parameters(model): | |||||
return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||||
# Returns number of parameters of the model | |||||
def get_number_of_parameters(model): | |||||
return sum(p.numel() for p in model.parameters()) | |||||
def main(cfg): | |||||
set_seeds(cfg.seed) | |||||
model, tokenizer = prepare_model(cfg) | |||||
num_of_all_params = get_number_of_parameters(model) | |||||
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||||
train_loader, val_loader_one, val_loader_two = prepare_data(cfg, tokenizer) | |||||
logging.info("Data is ready") | |||||
trainer = Trainer(cfg, model, train_loader) | |||||
trainer.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||||
if cfg.two_step_training: | |||||
if cfg.dp: | |||||
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||||
model_two, _ = prepare_model(cfg) | |||||
copy_model_weights(model, model_two) | |||||
del model | |||||
model = model_two | |||||
for a, b in model.roberta.named_parameters(): | |||||
if 'bias' in a: | |||||
b.requires_grad = True | |||||
else: | |||||
b.requires_grad = False | |||||
logging.info("New Model adjusted") | |||||
num_of_all_params = get_number_of_parameters(model) | |||||
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||||
trainer_two = Trainer(cfg, model, train_loader, checkpoint="temp.pth") | |||||
trainer_two.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||||
if cfg.use_wandb: | |||||
wandb.finish() | |||||
if __name__ == "__main__": | |||||
cfg = Config().args | |||||
log_path = "logs/" | |||||
if not os.path.exists(log_path): | |||||
os.makedirs(log_path) | |||||
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||||
if cfg.use_wandb: | |||||
wandb.login(key="YOUR_KEY") | |||||
if cfg.run_name: | |||||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||||
else: | |||||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||||
log_file_name = wandb.run.name | |||||
logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||||
logging.info("Start of the logging") | |||||
hyperparameters = {key: value for key, value in vars(cfg).items()} | |||||
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||||
logging.info("config:\n" + hyperparameters_str) | |||||
main(cfg) |
media_path = "YOUR_MEDIA_PATH" |
from datasets import load_from_disk | |||||
from torch.utils.data import DataLoader | |||||
from torch.utils.data import WeightedRandomSampler | |||||
TASK_TO_KEYS = { | |||||
"mnli": ("premise", "hypothesis"), | |||||
"qnli": ("question", "sentence"), | |||||
"qqp": ("question1", "question2"), | |||||
"sst2": ("sentence", None), | |||||
} | |||||
def prepare_data(cfg, tokenizer): | |||||
dataset = load_from_disk(f"{cfg.media_path}saved_datasets/{cfg.dataset}") | |||||
sentence1_key, sentence2_key = TASK_TO_KEYS[cfg.dataset] | |||||
if cfg.toy_example: | |||||
dataset["train"] = dataset["train"].select(range(1024)) | |||||
def tokenize(batch): | |||||
args = ((batch[sentence1_key],) if sentence2_key is None else (batch[sentence1_key], batch[sentence2_key])) | |||||
return tokenizer(*args, padding="max_length", truncation=True, max_length=cfg.max_length) | |||||
dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset)) | |||||
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label']) | |||||
cfg.train_data_size = len(dataset['train']) | |||||
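# Equal per-example weights with replacement make this an i.i.d. with-replacement sampler
# over the training set (the absolute weight values only matter relative to each other),
# which loosely mimics the Poisson subsampling assumed by DP-SGD privacy accounting.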
sampler = WeightedRandomSampler([cfg.virtual_batch_size/cfg.train_data_size for _ in range(cfg.train_data_size)], num_samples=cfg.train_data_size, replacement=True) | |||||
train_loader = DataLoader(dataset['train'], batch_size=cfg.virtual_batch_size, sampler=sampler, drop_last=True) | |||||
validation_loader_one = None | |||||
validation_loader_two = None | |||||
if cfg.dataset == "mnli": | |||||
if cfg.toy_example: | |||||
dataset["validation_matched"] = dataset["validation_matched"].select(range(100)) | |||||
dataset["validation_mismatched"] = dataset["validation_mismatched"].select(range(100)) | |||||
validation_loader_one = DataLoader(dataset['validation_matched'], batch_size=cfg.batch_size) | |||||
validation_loader_two = DataLoader(dataset['validation_mismatched'], batch_size=cfg.batch_size) | |||||
else: | |||||
if cfg.toy_example: | |||||
dataset["validation"] = dataset["validation"].select(range(100)) | |||||
validation_loader_one = DataLoader(dataset['validation'], batch_size=cfg.batch_size) | |||||
return train_loader, validation_loader_one, validation_loader_two |
from transformers import RobertaForSequenceClassification, RobertaTokenizer | |||||
import logging | |||||
import torch | |||||
def prepare_model(cfg): | |||||
tokenizer = RobertaTokenizer.from_pretrained(f"{cfg.media_path}models/roberta-large-tokenizer") | |||||
model = RobertaForSequenceClassification.from_pretrained(f"{cfg.media_path}models/roberta-large-model") | |||||
if cfg.dataset == 'mnli': | |||||
model.classifier.out_proj = torch.nn.Linear(model.classifier.out_proj.in_features, 3, bias=True) | |||||
# adjust model parameters | |||||
if cfg.peft_mode == "lora": | |||||
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||||
freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||||
logging.info("LoRA model loaded") | |||||
elif cfg.peft_mode == "bitfit": | |||||
for a, b in model.roberta.named_parameters(): | |||||
if not 'bias' in a: | |||||
b.requires_grad = False | |||||
logging.info("BiTFiT model loaded") | |||||
elif cfg.peft_mode == "lorabitfit": | |||||
mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||||
freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||||
if cfg.two_step_training == 0: | |||||
for a, b in model.roberta.named_parameters(): | |||||
if 'bias' in a: | |||||
b.requires_grad = True | |||||
logging.info("LoRA and BiTFiT combined model loaded") | |||||
elif cfg.peft_mode == "full": | |||||
logging.info("Full model loaded") | |||||
else: | |||||
logging.info("No acceptable model to load") | |||||
model.to(cfg.device) | |||||
return model, tokenizer | |||||
class LoRALayer(torch.nn.Module): | |||||
def __init__( | |||||
self, | |||||
module: torch.nn.Linear, | |||||
rank: int , | |||||
alpha: float | |||||
): | |||||
super().__init__() | |||||
self.rank = rank | |||||
self.alpha = alpha | |||||
self.scaling = self.alpha / self.rank # scaling factor | |||||
self.in_dim = module.in_features | |||||
self.out_dim = module.out_features | |||||
self.pretrained = module | |||||
self.sharif_llm_A = torch.nn.Linear(self.in_dim, self.rank, bias=False) | |||||
torch.nn.init.kaiming_normal_(self.sharif_llm_A.weight) | |||||
self.sharif_llm_B = torch.nn.Linear(self.rank, self.out_dim, bias=False) | |||||
torch.nn.init.zeros_(self.sharif_llm_B.weight) | |||||
def forward(self, x: torch.Tensor): | |||||
pretrained_out = self.pretrained(x) | |||||
lora_out = self.sharif_llm_A(x) # x@A | |||||
lora_out = self.sharif_llm_B(lora_out) # x@A@B | |||||
lora_out = self.scaling * lora_out # Scale by the scaling factor | |||||
return pretrained_out + lora_out # x@W + x@A@B*(scaling_factor) | |||||
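# Illustrative sketch (hypothetical helper, not used elsewhere in this file):
# wrapping a Linear module in LoRALayer leaves the pretrained weight in place and
# adds only rank * (in_dim + out_dim) extra parameters in the two low-rank factors.
def _lora_layer_example():
    base = torch.nn.Linear(1024, 1024)
    lora = LoRALayer(base, rank=8, alpha=16)
    x = torch.randn(2, 1024)
    out = lora(x)  # x@W + scaling * (x@A@B), same shape as base(x)
    lora_params = sum(p.numel() for n, p in lora.named_parameters() if 'sharif_llm' in n)
    return out.shape, lora_params  # torch.Size([2, 1024]), 16384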
def mutate_model(model: torch.nn.Module, rank: int, alpha: float): | |||||
""" | |||||
Replaces all linear layers in the model with LoRALinear layers. | |||||
Freeze all params except LoRA params. | |||||
""" | |||||
# Make sure the model does not already contain LoRALayer modules; return if any are found
for name, module in model.named_modules(): | |||||
if isinstance(module, LoRALayer): | |||||
logging.info("Model already contains LoRALinear layers! \n Try reloading the model.") | |||||
return | |||||
# we want to replace all query and value Linear modules with LoRALayer | |||||
for name, module in model.named_children(): | |||||
# if the module is linear and the name is for query or value | |||||
if isinstance(module, torch.nn.Linear) and (name == 'query' or name == 'value'): | |||||
# replace the module with LoRALayer | |||||
lora_layer = LoRALayer(module, rank, alpha) | |||||
setattr(model, name, lora_layer) | |||||
else: | |||||
mutate_model(module, rank, alpha) # recursively call the function on the module | |||||
def freeze_non_LoRA(model, peft_key): | |||||
for param_name, weights in model.named_parameters(): | |||||
weights.requires_grad = peft_key in param_name |
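# Illustrative sketch (hypothetical toy module): mutate_model only swaps Linear
# children literally named 'query' or 'value' for LoRALayer wrappers, and
# freeze_non_LoRA then leaves only the injected 'sharif_llm' factors trainable.
def _mutate_and_freeze_example():
    attn = torch.nn.Module()
    attn.query = torch.nn.Linear(32, 32)
    attn.key = torch.nn.Linear(32, 32)    # untouched: not named 'query' or 'value'
    attn.value = torch.nn.Linear(32, 32)
    mutate_model(attn, rank=4, alpha=8)
    freeze_non_LoRA(attn, peft_key='sharif_llm')
    return [n for n, p in attn.named_parameters() if p.requires_grad]
    # ['query.sharif_llm_A.weight', 'query.sharif_llm_B.weight',
    #  'value.sharif_llm_A.weight', 'value.sharif_llm_B.weight']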
from transformers import get_linear_schedule_with_warmup | |||||
import logging | |||||
import torch | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import AdamW | |||||
from tqdm import tqdm | |||||
from sklearn.metrics import accuracy_score | |||||
import wandb | |||||
import math | |||||
from opacus import PrivacyEngine | |||||
from opacus.utils.batch_memory_manager import BatchMemoryManager | |||||
class Trainer: | |||||
def __init__(self, cfg, model, train_loader, checkpoint=None): | |||||
self.criterion = CrossEntropyLoss() | |||||
self.val_criterion = CrossEntropyLoss() | |||||
self.optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, eps=cfg.optimizer_eps) | |||||
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||||
total_steps = math.ceil(len(train_loader) / self.gradient_accumulation_steps) * cfg.epochs | |||||
if cfg.scheduler: | |||||
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=cfg.scheduler_warmup_ratio*total_steps, num_training_steps=total_steps) | |||||
self.dp = cfg.dp | |||||
self.model = model | |||||
self.cfg = cfg | |||||
if cfg.dp: | |||||
self.model.train() | |||||
self.privacy_engine = PrivacyEngine( | |||||
accountant="rdp", | |||||
) | |||||
if checkpoint: | |||||
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||||
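# make_private_with_epsilon wraps the model, optimizer and data loader for DP-SGD and
# calibrates the noise multiplier so that, with per-sample gradients clipped at
# max_grad_norm, the RDP accountant reaches (target_epsilon, target_delta) after the
# planned number of epochs.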
self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon( | |||||
module=self.model, | |||||
optimizer=self.optimizer, | |||||
data_loader=train_loader, | |||||
target_epsilon=cfg.epsilon, | |||||
target_delta=cfg.delta, | |||||
epochs=cfg.epochs, | |||||
max_grad_norm=cfg.clipping_threshold, | |||||
) | |||||
def train_step(self, train_loader): | |||||
train_loss = 0 | |||||
self.model.train() | |||||
self.optimizer.zero_grad() | |||||
if self.cfg.dp: | |||||
with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader: | |||||
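# BatchMemoryManager re-chunks each logical batch (of size virtual_batch_size) into
# physical batches of at most cfg.batch_size; the DP optimizer only applies the
# parameter update once the full logical batch has been processed, so peak memory
# stays bounded without changing the effective batch size.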
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||||
# Move batch tensors to the same device as the model | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
# Forward pass | |||||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||||
loss = self.criterion(outputs.logits, batch["label"]) | |||||
loss.backward() | |||||
train_loss += loss.mean().item() | |||||
self.optimizer.step() | |||||
self.optimizer.zero_grad() | |||||
if self.cfg.scheduler: | |||||
self.scheduler.step() | |||||
else: | |||||
for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)): | |||||
# Move batch tensors to the same device as the model | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
# Forward pass | |||||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||||
loss = self.criterion(outputs.logits, batch["label"]) | |||||
loss.backward() | |||||
train_loss += loss.mean().item() | |||||
self.optimizer.step() | |||||
self.optimizer.zero_grad() | |||||
if self.cfg.scheduler: | |||||
self.scheduler.step() | |||||
return train_loss/len(train_loader) | |||||
def evaluate_step(self, val_loader): | |||||
# Evaluation loop | |||||
val_loss = 0 | |||||
self.model.eval() | |||||
predictions = [] | |||||
true_labels = [] | |||||
with torch.no_grad(): | |||||
for batch in tqdm(val_loader): | |||||
# Move batch tensors to the same device as the model | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
# Forward pass and compute validation loss | |||||
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||||
loss = self.val_criterion(outputs.logits, batch["label"]) | |||||
_, preds = torch.max(outputs.logits, dim=1) | |||||
predictions.extend(preds.tolist()) | |||||
true_labels.extend(batch["label"].tolist()) | |||||
val_loss += loss.item() | |||||
accuracy = accuracy_score(true_labels, predictions) | |||||
return accuracy , val_loss/len(val_loader) | |||||
def train_and_evaluate(self, epochs, train_loader, val_loader_one, val_loader_two): | |||||
best_accuracy = 0 | |||||
best_accuracy_two = 0 | |||||
wandb_log = [] | |||||
for epoch in range(epochs): | |||||
log_data = {} | |||||
train_loss = self.train_step(train_loader) | |||||
log_data["train_loss"] = train_loss | |||||
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||||
accuracy, val_loss = self.evaluate_step(val_loader=val_loader_one) | |||||
log_data["validation_loss"] = val_loss | |||||
log_data["accuracy"] = accuracy | |||||
if accuracy > best_accuracy: | |||||
best_accuracy = accuracy | |||||
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||||
logging.info(f"Accuracy on validation set: {accuracy * 100} %") | |||||
if val_loader_two: | |||||
accuracy_two , val_loss_two = self.evaluate_step(val_loader=val_loader_two) | |||||
log_data["validation_two_loss"] = val_loss_two | |||||
log_data["accuracy_two"] = accuracy_two | |||||
if accuracy_two > best_accuracy_two: | |||||
best_accuracy_two = accuracy_two | |||||
logging.info(f"Epoch {epoch+1} Validation two loss: {val_loss_two}") | |||||
logging.info(f"Accuracy on validation two set: {accuracy_two * 100} %") | |||||
wandb_log.append(log_data) | |||||
logging.info("Best results:") | |||||
if self.cfg.dp: | |||||
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||||
logging.info(f"Best validatin accuracy: {best_accuracy}") | |||||
if val_loader_two: | |||||
logging.info(f"Second validation set accuracy: {best_accuracy_two}") | |||||
if self.cfg.use_wandb: | |||||
for i, epoch_data in enumerate(wandb_log): | |||||
wandb.log(epoch_data) |
# Text Generation Task on E2E dataset
Details coming soon!
import argparse | |||||
import torch | |||||
from media import * | |||||
class Config: | |||||
def __init__(self): | |||||
self.parser = argparse.ArgumentParser() | |||||
self.add_arguments() | |||||
self.args = self.parse() | |||||
self.post_process() | |||||
def parse(self): | |||||
return self.parser.parse_args() | |||||
def add_arguments(self): | |||||
self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||||
# self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||||
self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility')
self.parser.add_argument('--batch_size', type=int, default=8, help='batch size for training') | |||||
self.parser.add_argument('--virtual_batch_size', type=int, default=8, help='batch size for updating model parameters') | |||||
self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||||
self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate')
self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||||
self.parser.add_argument('--optimizer_eps', type=float, default=1e-6, help='optimizer eps') | |||||
self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||||
self.parser.add_argument("--scheduler_type", type=str, default="linear", choices=['linear', 'steplr'], help="Scheduler types") | |||||
self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||||
self.parser.add_argument('--scheduler_warmup_steps', type=int, default=None, help='Warmup steps can be given directly') | |||||
self.parser.add_argument('--scheduler_step_size', type=int, default=1, help='Scheduler step size for stepLR scheduler') | |||||
self.parser.add_argument('--scheduler_gamma', type=float, default=0.5, help='Scheduler decrease rate for stepLR scheduler') | |||||
self.parser.add_argument('--model_name', type=str, default='gpt2', choices=['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'], help='Model name to fine-tune')
self.parser.add_argument('--seq_length', type=int, default=128, help='Max length for tokenization') | |||||
self.parser.add_argument('--peft_mode', type=str, default='bitfit', choices=['lora', 'bitfit', 'full', 'lorabitfit', 'adapter', 'adapterbitfit'], help='PEFT mode for fine-tuning') | |||||
self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||||
self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||||
self.parser.add_argument('--drop_out', type=float, default=0.0, help='Dropout for lora') | |||||
self.parser.add_argument('--reduction_factor', type=int, default=16, help='Reduction_factor for adapter') | |||||
self.parser.add_argument('--dataset', type=str, default='e2e_nlg', choices=['e2e_nlg', 'dart'], help='Dataset name') | |||||
self.parser.add_argument('--toy_example', type=int, default=0, help='if 1, the first 1024 data from train dataset will be used for training') | |||||
self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||||
self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||||
self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||||
self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||||
self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||||
self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||||
self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||||
self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||||
self.parser.add_argument("--beam_size", type=int, default=5, help="Number of beans for generation") | |||||
self.parser.add_argument('--f', type=str, default=None, help='Path to Jupyter kernel JSON file') | |||||
self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes adapter or lora then bitfit") | |||||
self.parser.add_argument('--lr_two', type=float, default=2e-3, help='Learning rate for second step of training')
self.parser.add_argument('--virtual_batch_size_two', type=int, default=8, help='batch size for updating model parameters for second step of training')
self.parser.add_argument('--epochs_two', type=int, default=5, help='Number of epochs for second step training') | |||||
self.parser.add_argument('--weight_decay_two', type=float, default=0.1, help='Weight decay for second optimizer') | |||||
def post_process(self): | |||||
assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size"
self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||||
self.args.media_path = media_path | |||||
self.args.model_cache_path = model_cache_path |
from datasets import load_from_disk | |||||
import torch | |||||
from torch.utils.data import DataLoader, WeightedRandomSampler | |||||
import copy | |||||
import sys | |||||
import torch | |||||
from torch.utils.data.dataset import Dataset | |||||
from transformers.tokenization_utils import PreTrainedTokenizer | |||||
from dataclasses import dataclass | |||||
from typing import Any, Callable, Dict, List, NewType, Tuple, Union | |||||
from torch.nn.utils.rnn import pad_sequence | |||||
from transformers.tokenization_utils import PreTrainedTokenizer | |||||
from transformers.tokenization_utils_base import BatchEncoding | |||||
def load_dataset(dataset_name, path, toy_example): | |||||
dataset = load_from_disk(f"{path}saved_datasets/{dataset_name}") | |||||
# toy example for develop | |||||
if toy_example == 1: | |||||
dataset["train"] = dataset["train"].select(range(1024)) | |||||
dataset["validation"] = dataset["validation"].select(range(512)) | |||||
return dataset | |||||
def load_dataloaders(dataset, dataset_name, batch_size, virtual_batch_size, tokenizer, seq_length, dp=1): | |||||
data_collator = DataCollatorForData2TextLanguageModeling(tokenizer) | |||||
if dataset_name == 'e2e_nlg': | |||||
train_dataset = E2ETextDataset(tokenizer, | |||||
dataset["train"]["meaning_representation"], | |||||
dataset["train"]["human_reference"], | |||||
seq_length, | |||||
tokenizer.bos_token, | |||||
tokenizer.eos_token, | |||||
seq_length) | |||||
validation_dataset = E2ETextDataset(tokenizer, | |||||
dataset["validation"]["meaning_representation"], | |||||
dataset["validation"]["human_reference"], | |||||
seq_length, | |||||
tokenizer.bos_token, | |||||
tokenizer.eos_token, | |||||
seq_length) | |||||
train_data_size = len(dataset["train"]) | |||||
if dp == 1: | |||||
sampler = WeightedRandomSampler([virtual_batch_size/train_data_size for _ in range(train_data_size)], num_samples=train_data_size, replacement=True) | |||||
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, sampler=sampler, drop_last=True, collate_fn=data_collator) | |||||
else: | |||||
train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, collate_fn=data_collator) | |||||
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, collate_fn=data_collator) | |||||
elif dataset_name == 'dart': | |||||
raise NotImplementedError("Dataloaders for the 'dart' dataset are not implemented yet.")
return train_loader, validation_loader | |||||
# Copyright (c) Xuechen Li. All Rights Reserved. | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
class E2ETextDataset(Dataset): | |||||
def __init__( | |||||
self, | |||||
tokenizer: PreTrainedTokenizer, | |||||
src_lines, | |||||
tgt_lines, | |||||
block_size: int, | |||||
bos_tok: str, | |||||
eos_tok: str, | |||||
max_seq_len=sys.maxsize, | |||||
max_examples=sys.maxsize, | |||||
**_, | |||||
): | |||||
src_lines = src_lines | |||||
tgt_lines = tgt_lines | |||||
edited_sents = [] | |||||
for src, tgt in zip(src_lines, tgt_lines): | |||||
sent = ' {} {} '.format(src, bos_tok) + tgt + ' {}'.format(eos_tok) | |||||
edited_sents.append(sent) | |||||
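# Each example is formatted as " <meaning_representation> <bos> <human_reference> <eos>";
# further below, every token up to and including the bos separator is set to -100 in the
# labels so that only the target reference contributes to the LM loss.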
# --- Filter out super long sentences --- | |||||
new_src_lines, new_tgt_lines, new_edited_sents = [], [], [] | |||||
for src_line, tgt_line, edited_sent in zip(src_lines, tgt_lines, edited_sents): | |||||
tokenized_edited_sent = tokenizer.tokenize(edited_sent) | |||||
if len(tokenized_edited_sent) <= max_seq_len: | |||||
new_src_lines.append(src_line) | |||||
new_tgt_lines.append(tgt_line) | |||||
new_edited_sents.append(edited_sent) | |||||
del src_line, tgt_line, edited_sent | |||||
src_lines, tgt_lines, edited_sents = new_src_lines, new_tgt_lines, new_edited_sents | |||||
# --------------------------------------- | |||||
# --- Truncate the dataset if necessary; this must be after the length filtering. --- | |||||
src_lines = src_lines[:max_examples] | |||||
tgt_lines = tgt_lines[:max_examples] | |||||
edited_sents = edited_sents[:max_examples] | |||||
# --- | |||||
batch_encoding = tokenizer( | |||||
edited_sents, | |||||
add_special_tokens=True, | |||||
truncation=True, | |||||
max_length=block_size, | |||||
is_split_into_words=False, | |||||
) | |||||
self.examples = batch_encoding["input_ids"] | |||||
self.labels = copy.deepcopy(self.examples) | |||||
# split into category words: | |||||
ssl_lst = [] | |||||
for ss in src_lines: | |||||
ssl = [la.split(':')[0].strip() for la in ss.split('|')] | |||||
ssl_lst.append(ssl) | |||||
self.src_cat = tokenizer( | |||||
ssl_lst, | |||||
add_special_tokens=True, | |||||
truncation=True, | |||||
max_length=block_size, | |||||
is_split_into_words=True | |||||
)['input_ids'] | |||||
self.src_sent = [] | |||||
self.tgt_sent = [] | |||||
# temp_src_len = 0 | |||||
# temp_tgt_len = 0 | |||||
# temp_count = 0 | |||||
separator = tokenizer(bos_tok, add_special_tokens=False)['input_ids'][0] | |||||
for i, elem in enumerate(self.labels): | |||||
sep_idx = elem.index(separator) + 1 | |||||
self.src_sent.append(self.examples[i][:sep_idx - 1]) | |||||
self.tgt_sent.append(self.examples[i][sep_idx - 1:]) | |||||
self.labels[i][:sep_idx] = [-100] * sep_idx # Doesn't contribute to loss. | |||||
# temp_src_len += sep_idx - 1 | |||||
# temp_tgt_len += len(elem) - (sep_idx - 1) | |||||
# temp_count += 1 | |||||
# print('tgt_avg: ', temp_tgt_len / temp_count) | |||||
# print('src_avg: ', temp_src_len / temp_count) | |||||
# print('ratios: ', temp_src_len / temp_tgt_len) | |||||
# print(self.labels[0]) | |||||
# print(self.examples[0]) | |||||
# print(edited_sents[0]) | |||||
# print(self.src_sent[0]) | |||||
# print(self.tgt_sent[0]) | |||||
# print(self.src_cat[0]) | |||||
assert len(self.src_cat) == len(self.examples) | |||||
def __len__(self): | |||||
return len(self.examples) | |||||
def __getitem__(self, i): | |||||
return ( | |||||
torch.tensor(self.examples[i], dtype=torch.long), | |||||
torch.tensor(self.labels[i], dtype=torch.long), | |||||
torch.tensor(self.src_sent[i], dtype=torch.long), | |||||
torch.tensor(self.tgt_sent[i], dtype=torch.long), | |||||
torch.tensor(self.src_cat[i], dtype=torch.long), | |||||
) | |||||
# InputDataClass = NewType("InputDataClass", Any) | |||||
""" | |||||
A DataCollator is a function that takes a list of samples from a Dataset | |||||
and collate them into a batch, as a dictionary of Tensors. | |||||
""" | |||||
# DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]]) | |||||
@dataclass | |||||
class DataCollatorForData2TextLanguageModeling: | |||||
""" | |||||
Data collator used for language modeling. | |||||
- collates batches of tensors, honoring their tokenizer's pad_token | |||||
- preprocesses batches for masked language modeling | |||||
""" | |||||
tokenizer: PreTrainedTokenizer | |||||
mlm: bool = False | |||||
format_mode: str = 'cat' | |||||
mlm_probability: float = 0.15 | |||||
def __call__( | |||||
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||||
) -> Dict[str, torch.Tensor]: | |||||
if isinstance(examples[0], (dict, BatchEncoding)): | |||||
examples = [e["input_ids"] for e in examples] | |||||
input_ids, labels, src, tgt, cate = zip(*examples) | |||||
if self.mlm: | |||||
inputs, labels = self.mask_tokens(self._tensorize_batch(input_ids))
return {"input_ids": inputs, "labels": labels} | |||||
else: | |||||
if self.format_mode == 'cat': | |||||
mode_input = 3 | |||||
elif self.format_mode == 'peek': | |||||
mode_input = 1 | |||||
elif self.format_mode == 'nopeek': | |||||
mode_input = 2 | |||||
elif self.format_mode == 'infix': | |||||
mode_input = 4 | |||||
# mode_input = 1 # means that we take the input again. | |||||
# mode_input = 2 # means that we do not peek at src again. | |||||
# mode_input = 3 # means that we look at the categories, and see the input again. | |||||
if mode_input == 1: | |||||
# input, batch | |||||
batch = self._tensorize_batch(input_ids) | |||||
labels = self._tensorize_batch(labels) | |||||
src = self._tensorize_batch(src) | |||||
cate_batch, cate_attn = None, None | |||||
# tgt = self._tensorize_batch(tgt) | |||||
elif mode_input == 2: | |||||
# nopeek. | |||||
batch = self._tensorize_batch(tgt) | |||||
labels = batch.clone() | |||||
src = self._tensorize_batch(src) | |||||
cate_batch, cate_attn = None, None | |||||
elif mode_input == 3: | |||||
batch = self._tensorize_batch(input_ids) | |||||
labels = self._tensorize_batch(labels) | |||||
src = self._tensorize_batch(cate) | |||||
cate_batch, cate_attn = None, None | |||||
elif mode_input == 4: | |||||
batch = self._tensorize_batch(tgt) | |||||
labels = batch.clone() | |||||
src = self._tensorize_batch(src) | |||||
cate_batch = self._tensorize_batch(cate) | |||||
cate_attn = (cate_batch != self.tokenizer.pad_token_id) | |||||
labels[labels == self.tokenizer.pad_token_id] = -100 # tgt | |||||
src_attn = (src != self.tokenizer.pad_token_id) # src | |||||
tgt_attn = (batch != self.tokenizer.pad_token_id) # tgt | |||||
if cate_batch is None: | |||||
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn':tgt_attn, | |||||
'src':src} | |||||
else: | |||||
return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn': tgt_attn, | |||||
'src': src, "cate_batch":cate_batch, "cate_attn":cate_attn} | |||||
def _tensorize_batch( | |||||
self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||||
) -> torch.Tensor: | |||||
# In order to accept both lists of lists and lists of Tensors | |||||
if isinstance(examples[0], (list, tuple)): | |||||
examples = [torch.tensor(e, dtype=torch.long) for e in examples] | |||||
length_of_first = examples[0].size(0) | |||||
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) | |||||
if are_tensors_same_length: | |||||
return torch.stack(examples, dim=0) | |||||
else: | |||||
if self.tokenizer._pad_token is None: | |||||
raise ValueError( | |||||
"You are attempting to pad samples but the tokenizer you are using" | |||||
f" ({self.tokenizer.__class__.__name__}) does not have one." | |||||
) | |||||
return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) | |||||
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: | |||||
""" | |||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. | |||||
""" | |||||
if self.tokenizer.mask_token is None: | |||||
raise ValueError( | |||||
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." | |||||
) | |||||
labels = inputs.clone() | |||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) | |||||
probability_matrix = torch.full(labels.shape, self.mlm_probability) | |||||
special_tokens_mask = [ | |||||
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() | |||||
] | |||||
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) | |||||
if self.tokenizer._pad_token is not None: | |||||
padding_mask = labels.eq(self.tokenizer.pad_token_id) | |||||
probability_matrix.masked_fill_(padding_mask, value=0.0) | |||||
masked_indices = torch.bernoulli(probability_matrix).bool() | |||||
labels[~masked_indices] = -100 # We only compute loss on masked tokens | |||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) | |||||
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices | |||||
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) | |||||
# 10% of the time, we replace masked input tokens with random word | |||||
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced | |||||
random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) | |||||
inputs[indices_random] = random_words[indices_random] | |||||
# The rest of the time (10% of the time) we keep the masked input tokens unchanged | |||||
return inputs, labels |
from config import Config | |||||
import os | |||||
import random | |||||
import numpy as np | |||||
import torch | |||||
import wandb | |||||
import logging | |||||
import transformers | |||||
import warnings | |||||
import subprocess | |||||
from model import load_model, prepare_model, get_number_of_trainable_parameters, load_model_weights, get_number_of_parameters | |||||
from data import load_dataset, load_dataloaders | |||||
from train import Trainer, generate_evaluation_output, save_evaluation_output | |||||
from utils import clean_hyperparameters, copy_model_weights | |||||
warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes") | |||||
transformers.logging.set_verbosity_error() | |||||
def set_seeds(seed: int): | |||||
os.environ['PYTHONHASHSEED'] = str(seed) | |||||
random.seed(seed) | |||||
np.random.seed(seed) | |||||
torch.manual_seed(seed) | |||||
torch.cuda.manual_seed(seed) | |||||
torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||||
transformers.set_seed(seed) | |||||
def run_metric_script(file_path): | |||||
result = subprocess.run(["e2e/measure_scores.py", "-p", "e2e_ref.txt", file_path], stdout=subprocess.PIPE) | |||||
output = result.stdout.decode('utf-8') | |||||
lines = output.split('\n') | |||||
return lines[-7:-2] | |||||
def main(cfg): | |||||
set_seeds(cfg.seed) | |||||
model, tokenizer = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||||
model = prepare_model(model, cfg) | |||||
num_of_all_params = get_number_of_parameters(model) | |||||
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||||
dataset = load_dataset(cfg.dataset, cfg.media_path, cfg.toy_example) | |||||
cfg.train_data_size = len(dataset["train"]) | |||||
# dataset = tokenize_dataset(tokenizer, dataset, cfg.dataset, cfg.seq_length) | |||||
train_loader, validation_loader = load_dataloaders(dataset, cfg.dataset, cfg.batch_size, cfg.virtual_batch_size, tokenizer, cfg.seq_length, cfg.dp) | |||||
logging.info("Dataset loaded and tokenized") | |||||
trainer = Trainer(cfg, model, train_loader) | |||||
trainer.train_and_evaluate(cfg.epochs, train_loader, validation_loader) | |||||
if cfg.two_step_training and cfg.dp: | |||||
trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||||
model_two, _ = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||||
model_two = prepare_model(model_two, cfg) | |||||
copy_model_weights(model, model_two) | |||||
del model | |||||
model = model_two | |||||
for a, b in model.named_parameters(): | |||||
if 'bias' in a and not 'adapter' in a: | |||||
b.requires_grad = True | |||||
else: | |||||
b.requires_grad = False | |||||
logging.info("New Model adjusted") | |||||
num_of_all_params = get_number_of_parameters(model) | |||||
num_of_trainable_params = get_number_of_trainable_parameters(model)
percentage = round(100 * num_of_trainable_params / num_of_all_params, 2)
logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}")
logging.info(f"Percentage of trainable parameters: {percentage} %") | |||||
trainer_two = Trainer(cfg, model, train_loader, second_trainer=True) | |||||
trainer_two.train_and_evaluate(cfg.epochs_two, train_loader, validation_loader) | |||||
# evaluate model on test data | |||||
model.eval() | |||||
model = load_model_weights(model, cfg.peft_mode, f"{trainer.save_path}/{trainer.model_name}.pth") | |||||
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size) | |||||
output_path = f"{cfg.media_path}generation_eval_outputs/{cfg.dataset}/{cfg.peft_mode}" | |||||
if not os.path.exists(output_path): | |||||
os.makedirs(output_path) | |||||
output_name = cfg.run_name if cfg.run_name else "generation_output" | |||||
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v1.txt") | |||||
evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size, do_sample=True) | |||||
save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v2.txt") | |||||
logging.info("Generation for test data saved") | |||||
metrics = run_metric_script(f"{output_path}/{output_name}-v1.txt") | |||||
logging.info("Metrics without sampling:") | |||||
for metric in metrics: | |||||
logging.info(metric) | |||||
metrics = run_metric_script(f"{output_path}/{output_name}-v2.txt") | |||||
logging.info("Metrics with sampling:") | |||||
for metric in metrics: | |||||
logging.info(metric) | |||||
if cfg.use_wandb: | |||||
wandb.finish() | |||||
if __name__ == "__main__": | |||||
cfg = Config().args | |||||
log_path = f"logs/{cfg.dataset}/{cfg.peft_mode}/" | |||||
if not os.path.exists(log_path): | |||||
os.makedirs(log_path) | |||||
log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||||
if cfg.use_wandb: | |||||
wandb.login(key="YOUR_KEY") | |||||
if cfg.run_name: | |||||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||||
else: | |||||
wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||||
log_file_name = wandb.run.name | |||||
logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||||
logging.info("Start of the logging") | |||||
hyperparameters = {key: value for key, value in vars(cfg).items()} | |||||
hyperparameters = clean_hyperparameters(hyperparameters) | |||||
hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||||
logging.info("config:\n" + hyperparameters_str) | |||||
main(cfg) |
media_path = "YOUR_MEDIA_PATH" | |||||
model_cache_path = "YOUR_CACHE_PATH" |
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel | |||||
import torch | |||||
import transformers | |||||
from torch import nn | |||||
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP | |||||
import os | |||||
# Loads model and its tokenizer | |||||
def load_model(model_name, cache_dir="."): | |||||
tokenizer = GPT2Tokenizer.from_pretrained(f"{cache_dir}gpt2/{model_name}-tokenizer") | |||||
model = GPT2LMHeadModel.from_pretrained(f"{cache_dir}gpt2/{model_name}-model") | |||||
add_pad_token(model, tokenizer) | |||||
model.requires_grad_(False) | |||||
return model, tokenizer | |||||
# Adds padding token to the tokenizer and model embedding layer | |||||
def add_pad_token(model, tokenizer): | |||||
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) | |||||
model.resize_token_embeddings(len(tokenizer)) | |||||
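# Initialize the newly appended [PAD] embedding row to the mean of the pretrained
# token embeddings rather than leaving it at its random initialization.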
a = model.get_input_embeddings().weight | |||||
a.data[-1] = a.data[:-1].mean(dim=0) | |||||
# Returns the number of trainable parameters of the model
def get_number_of_trainable_parameters(model): | |||||
return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||||
# Returns number of parameters of the model | |||||
def get_number_of_parameters(model): | |||||
return sum(p.numel() for p in model.parameters()) | |||||
# Mutates model structure and adjusts trainable parameters | |||||
def prepare_model(model, cfg): | |||||
if cfg.peft_mode == 'bitfit': | |||||
for a, b in model.named_parameters(): | |||||
if 'bias' in a: | |||||
b.requires_grad = True | |||||
elif cfg.peft_mode == 'lora': | |||||
model.requires_grad_(True) | |||||
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||||
mark_only_lora_as_trainable(model) | |||||
elif cfg.peft_mode == 'lorabitfit': | |||||
model.requires_grad_(True) | |||||
model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||||
mark_only_lora_as_trainable(model) | |||||
if cfg.two_step_training == 0: | |||||
for a, b in model.named_parameters(): | |||||
if 'bias' in a: | |||||
b.requires_grad = True | |||||
elif cfg.peft_mode == 'full': | |||||
model.requires_grad_(True) | |||||
elif cfg.peft_mode == 'adapter': | |||||
model.requires_grad_(False) | |||||
bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||||
mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||||
for a, b in model.named_parameters(): | |||||
if 'adapter' in a: | |||||
b.requires_grad = True | |||||
elif cfg.peft_mode == 'adapterbitfit': | |||||
model.requires_grad_(False) | |||||
bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||||
mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||||
if cfg.two_step_training == 0: | |||||
for a, b in model.named_parameters(): | |||||
if 'adapter' in a or 'bias' in a: | |||||
b.requires_grad = True | |||||
else: | |||||
for a, b in model.named_parameters(): | |||||
if 'adapter' in a: | |||||
b.requires_grad = True | |||||
model.to(cfg.device) | |||||
return model | |||||
def save_model(model, peft_mode, save_path, model_name): | |||||
if not os.path.exists(save_path): | |||||
os.makedirs(save_path) | |||||
if peft_mode == "bitfit": | |||||
bias_params = {} | |||||
for name, param in model.named_parameters(): | |||||
if 'bias' in name: | |||||
bias_params[name] = param.data.clone() | |||||
torch.save(bias_params, f'{save_path}/{model_name}.pth') | |||||
elif peft_mode == 'lora': | |||||
lora_params = {} | |||||
for name, param in model.named_parameters(): | |||||
if 'lora' in name: | |||||
lora_params[name] = param.data.clone() | |||||
torch.save(lora_params, f'{save_path}/{model_name}.pth') | |||||
elif peft_mode == 'lorabitfit': | |||||
lorabitfit_params = {} | |||||
for name, param in model.named_parameters(): | |||||
if 'lora' in name or 'bias' in name: | |||||
lorabitfit_params[name] = param.data.clone() | |||||
torch.save(lorabitfit_params, f'{save_path}/{model_name}.pth') | |||||
elif peft_mode == 'full': | |||||
pass | |||||
elif peft_mode == 'adapter': | |||||
adapter_params = {} | |||||
for name, param in model.named_parameters(): | |||||
if 'adapter' in name: | |||||
adapter_params[name] = param.data.clone() | |||||
torch.save(adapter_params, f'{save_path}/{model_name}.pth') | |||||
elif peft_mode == 'adapterbitfit': | |||||
adapterbitfit_params = {} | |||||
for name, param in model.named_parameters(): | |||||
if 'adapter' in name or 'bias' in name: | |||||
adapterbitfit_params[name] = param.data.clone() | |||||
torch.save(adapterbitfit_params, f'{save_path}/{model_name}.pth') | |||||
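# Note: save_model above serializes only the PEFT parameters (biases, LoRA factors,
# adapters); load_model_weights below therefore assumes a freshly loaded model that has
# already been passed through prepare_model, so every other weight comes from the
# pretrained checkpoint.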
def load_model_weights(model, peft_mode, path): | |||||
if peft_mode == 'full': | |||||
pass | |||||
else: | |||||
model_weights = torch.load(path) | |||||
with torch.no_grad(): | |||||
for name, param in model.named_parameters(): | |||||
if name in model_weights: | |||||
param.copy_(model_weights[name]) | |||||
return model | |||||
# Copyright (c) Xuechen Li. All Rights Reserved. | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
""" | |||||
LoRA layers. | |||||
This version does not have merged weights for zero latency inference. It makes the code easier to read and maintain. | |||||
Adapted from | |||||
https://github.com/microsoft/LoRA | |||||
https://www.microsoft.com/en-us/research/project/dp-transformers/ | |||||
""" | |||||
class MYDPMergedLinear(nn.Module): | |||||
def __init__( | |||||
self, | |||||
in_features: int, | |||||
out_features: int, | |||||
pretrained_module, | |||||
lora_r=0, | |||||
lora_alpha=1., | |||||
lora_dropout=0., | |||||
): | |||||
super(MYDPMergedLinear, self).__init__() | |||||
self.pretrained_module = pretrained_module | |||||
self.lora_r = lora_r | |||||
self.lora_alpha = lora_alpha | |||||
self.lora_dropout = nn.Dropout(p=lora_dropout) | |||||
if self.lora_r > 0: | |||||
self.lora_A = nn.Linear(in_features=in_features, out_features=lora_r, bias=False) | |||||
self.lora_B = nn.Linear(in_features=lora_r, out_features=out_features, bias=False) | |||||
self.scaling = self.lora_alpha / lora_r | |||||
self.reset_parameters() | |||||
def forward(self, x: torch.Tensor): | |||||
result = self.pretrained_module(x) | |||||
if self.lora_r > 0: | |||||
after_dropout = self.lora_dropout(x) | |||||
after_A = self.lora_A(after_dropout) | |||||
after_B = self.lora_B(after_A) | |||||
result += after_B * self.scaling | |||||
return result | |||||
def reset_parameters(self): | |||||
# self.linear.reset_parameters() | |||||
if self.lora_r > 0: | |||||
self.lora_A.reset_parameters() | |||||
self.lora_B.weight.data.zero_() | |||||
@staticmethod | |||||
def from_transformers_conv1d( | |||||
original_layer, | |||||
lora_r=0, | |||||
lora_alpha=1., | |||||
lora_dropout=0., | |||||
) -> "MYDPMergedLinear": | |||||
lora_layer = MYDPMergedLinear( | |||||
in_features=original_layer.weight.shape[0], | |||||
out_features=original_layer.weight.shape[1], | |||||
pretrained_module = original_layer, | |||||
lora_r=lora_r, | |||||
lora_alpha=lora_alpha, | |||||
lora_dropout=lora_dropout, | |||||
).to(original_layer.weight.device) | |||||
return lora_layer | |||||
def convert_gpt2_attention_to_lora( | |||||
model: transformers.GPT2PreTrainedModel, | |||||
lora_r=0, | |||||
lora_alpha=1., | |||||
lora_dropout=0., | |||||
) -> transformers.GPT2PreTrainedModel: | |||||
if not isinstance(model, transformers.GPT2PreTrainedModel): | |||||
raise TypeError("Requires a GPT2 model") | |||||
if not hasattr(model, "h") and hasattr(model, "transformer"): | |||||
transformer = model.transformer | |||||
else: | |||||
transformer = model | |||||
for h_i in transformer.h: | |||||
new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||||
original_layer=h_i.attn.c_attn, | |||||
lora_r=lora_r, | |||||
lora_alpha=lora_alpha, | |||||
lora_dropout=lora_dropout, | |||||
) | |||||
h_i.attn.c_attn = new_layer | |||||
return model | |||||
def mutate_model(model: torch.nn.Module, lora_r=0, lora_alpha=1., lora_dropout=0.): | |||||
for name, module in model.named_children(): | |||||
if name == "c_attn": | |||||
new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||||
original_layer=module, | |||||
lora_r=lora_r, | |||||
lora_alpha=lora_alpha, | |||||
lora_dropout=lora_dropout, | |||||
) | |||||
setattr(model, name, new_layer) | |||||
else: | |||||
mutate_model(module, lora_r, lora_alpha, lora_dropout) # recursively call the function on the module | |||||
def mark_only_lora_as_trainable(model: torch.nn.Module) -> None: | |||||
model.requires_grad_(True) | |||||
for n, p in model.named_parameters(): | |||||
if 'lora_' not in n: | |||||
p.requires_grad = False | |||||
class AdapterLayer(nn.Module): | |||||
def __init__( | |||||
self, | |||||
emb_dim: int, | |||||
bottleneck_size: int, | |||||
bias = True | |||||
): | |||||
super().__init__() | |||||
self.sharif_llm_adapter = nn.Sequential( | |||||
nn.Linear(emb_dim, bottleneck_size, bias=bias), | |||||
nn.ReLU(), | |||||
nn.Linear(bottleneck_size, emb_dim, bias=bias) | |||||
) | |||||
def forward(self, x: torch.Tensor): | |||||
output = x + self.sharif_llm_adapter(x) | |||||
return output | |||||
class FeedForwardAdapterWrapper(nn.Module): | |||||
def __init__( | |||||
self, | |||||
original_module: GPT2MLP, | |||||
bottleneck_size: int, | |||||
emb_dim, | |||||
bias = True | |||||
): | |||||
super().__init__() | |||||
assert isinstance(original_module, GPT2MLP) | |||||
self.original_module = original_module | |||||
self.adapter = AdapterLayer(emb_dim, bottleneck_size, bias=bias) | |||||
def forward(self, x: torch.Tensor): | |||||
output = self.original_module(x) | |||||
output = self.adapter(output) | |||||
return output | |||||
def mutate_model_recursive_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||||
for name, module in model.named_children(): | |||||
if isinstance(module, GPT2MLP): | |||||
feed_forward_with_adapter = FeedForwardAdapterWrapper(module, bottleneck_size, emb_dim, bias) | |||||
setattr(model, name, feed_forward_with_adapter) | |||||
else: | |||||
mutate_model_recursive_adapter(module, bottleneck_size, emb_dim, bias) # recursively call the function on the module | |||||
def mutate_model_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||||
if hasattr(model, '_mutated'): | |||||
print("Model already contains adapter layers! \n Try reloading the model.") | |||||
return | |||||
mutate_model_recursive_adapter(model, bottleneck_size, emb_dim, bias) | |||||
model._mutated = True |
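# Illustrative sketch (hypothetical helper, not used elsewhere): an AdapterLayer is a
# residual bottleneck MLP, so its output keeps the input shape while adding roughly
# 2 * emb_dim * bottleneck_size trainable parameters per wrapped feed-forward block.
def _adapter_layer_example():
    layer = AdapterLayer(emb_dim=768, bottleneck_size=768 // 16)  # reduction_factor = 16
    x = torch.randn(2, 10, 768)
    out = layer(x)  # same shape as x thanks to the residual connection
    return out.shape, sum(p.numel() for p in layer.parameters())  # ~74.5k parameters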
from transformers import get_linear_schedule_with_warmup | |||||
import logging | |||||
import torch | |||||
import torch.nn.functional as F | |||||
from torch.optim import AdamW | |||||
from tqdm import tqdm | |||||
from opacus import PrivacyEngine | |||||
from opacus.utils.batch_memory_manager import BatchMemoryManager | |||||
import wandb | |||||
import math | |||||
from model import save_model | |||||
from torch.optim.lr_scheduler import StepLR | |||||
class Trainer: | |||||
def __init__(self, cfg, model, train_loader, checkpoint=None, second_trainer=False): | |||||
if second_trainer: | |||||
self.epochs = cfg.epochs_two | |||||
self.lr = cfg.lr_two | |||||
self.weight_decay = cfg.weight_decay_two | |||||
else: | |||||
self.epochs = cfg.epochs | |||||
self.lr = cfg.lr | |||||
self.weight_decay = cfg.weight_decay | |||||
self.optimizer = AdamW(model.parameters(), lr=self.lr, weight_decay=self.weight_decay, eps=cfg.optimizer_eps) | |||||
self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||||
total_steps = len(train_loader) * self.gradient_accumulation_steps * self.epochs | |||||
if cfg.scheduler: | |||||
if cfg.scheduler_type == "linear": | |||||
warmup_steps = cfg.scheduler_warmup_steps if cfg.scheduler_warmup_steps else cfg.scheduler_warmup_ratio*total_steps | |||||
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) | |||||
elif cfg.scheduler_type == "steplr": | |||||
self.scheduler = StepLR(self.optimizer, step_size=cfg.scheduler_step_size, gamma=cfg.scheduler_gamma) | |||||
self.dp = cfg.dp | |||||
self.model = model | |||||
self.cfg = cfg | |||||
self.save_path = f"{cfg.media_path}generation_saved_models/{cfg.dataset}/{cfg.peft_mode}" | |||||
self.model_name = self.cfg.run_name if self.cfg.run_name else "best_model" | |||||
if cfg.dp: | |||||
self.model.train() | |||||
self.privacy_engine = PrivacyEngine( | |||||
accountant="rdp", | |||||
) | |||||
if checkpoint: | |||||
self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||||
            # Wrap the model and optimizer for DP-SGD: Opacus calibrates the noise multiplier
            # so that training for the given number of epochs stays within the (epsilon, delta) budget
            self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon(
module=self.model, | |||||
optimizer=self.optimizer, | |||||
data_loader=train_loader, | |||||
target_epsilon=cfg.epsilon, | |||||
target_delta=cfg.delta, | |||||
epochs=self.epochs, | |||||
max_grad_norm=cfg.clipping_threshold, | |||||
) | |||||
def train_step(self, train_loader): | |||||
train_loss = 0 | |||||
self.model.train() | |||||
self.optimizer.zero_grad() | |||||
if self.dp: | |||||
            # BatchMemoryManager splits each logical batch into physical chunks of at most
            # batch_size; the DP optimizer only applies an update after the last chunk of a logical batch
            with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader:
for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||||
# Move batch tensors to the same device as the model | |||||
batch = prepare_inputs(batch) | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
# Forward pass | |||||
outputs = self.model(**batch) | |||||
loss = outputs.loss | |||||
loss.backward() | |||||
train_loss += loss.item() | |||||
self.optimizer.step() | |||||
self.optimizer.zero_grad() | |||||
if self.cfg.scheduler and self.cfg.scheduler_type == "linear": | |||||
self.scheduler.step() | |||||
if self.cfg.scheduler and self.cfg.scheduler_type == "steplr": | |||||
self.scheduler.step() | |||||
else: | |||||
            for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)):
# Move batch tensors to the same device as the model | |||||
batch = prepare_inputs(batch) | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
# Forward pass | |||||
outputs = self.model(**batch) | |||||
loss = outputs.loss | |||||
loss.backward() | |||||
train_loss += loss.item() | |||||
self.optimizer.step() | |||||
self.optimizer.zero_grad() | |||||
if self.cfg.scheduler and self.cfg.scheduler_type == "linear": | |||||
self.scheduler.step() | |||||
if self.cfg.scheduler and self.cfg.scheduler_type == "steplr": | |||||
self.scheduler.step() | |||||
return train_loss/len(train_loader) | |||||
def evaluate_step(self, val_loader): | |||||
# Evaluation loop | |||||
val_loss = 0 | |||||
self.model.eval() | |||||
with torch.no_grad(): | |||||
for batch in tqdm(val_loader): | |||||
# Move batch tensors to the same device as the model | |||||
batch = prepare_inputs(batch) | |||||
batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||||
outputs = self.model(**batch) | |||||
loss = compute_loss_per_input(outputs, batch) | |||||
val_loss += loss.mean().item() | |||||
return val_loss/len(val_loader) | |||||
def train_and_evaluate(self, epochs, train_loader, val_loader): | |||||
best_validation_loss = None | |||||
best_epoch = 0 | |||||
wandb_log = [] | |||||
for epoch in range(epochs): | |||||
log_data = {} | |||||
train_loss = self.train_step(train_loader) | |||||
log_data["train_loss"] = train_loss | |||||
logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||||
val_loss = self.evaluate_step(val_loader=val_loader) | |||||
log_data["validation_loss"] = val_loss | |||||
logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||||
if best_validation_loss is None or val_loss < best_validation_loss: | |||||
best_validation_loss = val_loss | |||||
best_epoch = epoch | |||||
save_model(self.model, self.cfg.peft_mode, self.save_path, self.model_name) | |||||
logging.info(f"Model improved and saved for epoch {epoch+1}") | |||||
wandb_log.append(log_data) | |||||
logging.info("Best results:") | |||||
if self.cfg.dp: | |||||
logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||||
logging.info(f"Best validatin loss: {best_validation_loss} for Epoch: {best_epoch+1}") | |||||
if self.cfg.use_wandb: | |||||
for i, epoch_data in enumerate(wandb_log): | |||||
wandb.log(epoch_data) | |||||
def prepare_inputs(batch):
    # Drop auxiliary fields so that only model inputs remain in the batch
    batch.pop('src_attn', None)
    batch.pop('tgt_attn', None)
    batch.pop('src', None)
    return batch
def compute_loss_per_input(outputs, batch):
    # Per-example language-modelling loss: shift logits and labels for next-token
    # prediction, then average token losses over the non-ignored (label != -100) positions
    logits = outputs.logits
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = batch["labels"][..., 1:].contiguous()
    seq_lens = (shift_labels != -100).sum(dim=1)
    # reduction="none" keeps one loss per token; ignore_index defaults to -100, zeroing padded positions
    loss = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none")
    loss = loss.sum(dim=1) / seq_lens
    return loss
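# Quick self-contained sanity check for compute_loss_per_input (illustrative only): label
# positions set to -100 are ignored, so each example's loss is averaged over its real tokens.
if __name__ == "__main__":
    class _DummyOutputs:
        def __init__(self, logits):
            self.logits = logits

    vocab_size = 10
    logits = torch.randn(2, 5, vocab_size)        # (batch, seq_len, vocab)
    labels = torch.randint(0, vocab_size, (2, 5))
    labels[1, 3:] = -100                          # pad out the tail of the second example
    per_example_loss = compute_loss_per_input(_DummyOutputs(logits), {"labels": labels})
    print(per_example_loss.shape)                 # torch.Size([2]) -- one loss per example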
def save_evaluation_output(outputs, path):
    # Write every generated candidate on its own line; the with-block closes the file
    with open(path, "w") as file:
        for strings in outputs:
            for string in strings:
                file.write(string + "\n")
            # file.write("\n")
def generate_evaluation_output(model, tokenizer, data, device, max_length, beam_size=5, do_sample=False, num_return_sequences=1): | |||||
generated_texts = [] | |||||
prev = None | |||||
for entry in tqdm(data): | |||||
if prev != entry["meaning_representation"]: | |||||
prev = entry["meaning_representation"] | |||||
prompt = f"{entry['meaning_representation']} {tokenizer.eos_token}" | |||||
inputs = tokenizer(prompt, return_tensors="pt", truncation=True) | |||||
inputs = {key: val.to(device) for key, val in inputs.items()} | |||||
with torch.no_grad(): | |||||
outputs = model.generate(**inputs, | |||||
num_beams=beam_size, | |||||
max_length=max_length, | |||||
do_sample=do_sample, | |||||
early_stopping=True, | |||||
min_length=5, | |||||
num_return_sequences=num_return_sequences, | |||||
bad_words_ids = [[628], [198], [tokenizer.pad_token_id]], | |||||
pad_token_id=tokenizer.eos_token_id, | |||||
repetition_penalty=1, | |||||
top_k=0, | |||||
top_p=0.9) | |||||
temp_generated_texts = [] | |||||
for output in outputs: | |||||
generated_text = tokenizer.decode(output[len(inputs["input_ids"][0]):], skip_special_tokens=True) | |||||
temp_generated_texts.append(generated_text.strip()) | |||||
generated_texts.append(temp_generated_texts) | |||||
return generated_texts | |||||
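# A minimal end-to-end generation sketch (assumptions, not the original script: a plain
# "gpt2" checkpoint stands in for a fine-tuned model, eos is reused as the pad token, and
# the single data entry and the output path are made up for illustration).
if __name__ == "__main__":
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # assumption: the real setup may use a dedicated pad token
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device).eval()

    data = [{"meaning_representation": "name[Alimentum], area[city centre], familyFriendly[no]"}]
    texts = generate_evaluation_output(model, tokenizer, data, device, max_length=128, beam_size=5)
    save_evaluation_output(texts, "generated_outputs.txt")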
import torch | |||||
def clean_hyperparameters(hyperparameters: dict):
    # Remove config entries that do not apply to the selected run (e.g. LoRA settings when LoRA is not used)
if hyperparameters["scheduler"] == 0: | |||||
hyperparameters.pop("scheduler_type", None) | |||||
hyperparameters.pop("scheduler_warmup_ratio", None) | |||||
hyperparameters.pop("scheduler_warmup_steps", None) | |||||
hyperparameters.pop("scheduler_step_size", None) | |||||
hyperparameters.pop("scheduler_gamma", None) | |||||
if hyperparameters["peft_mode"] != "lora": | |||||
hyperparameters.pop("rank", None) | |||||
hyperparameters.pop("alpha", None) | |||||
hyperparameters.pop("drop_out", None) | |||||
if hyperparameters["peft_mode"] != "adapter" and hyperparameters["peft_mode"] != "adapterbitfit": | |||||
hyperparameters.pop("reduction_factor", None) | |||||
if hyperparameters["dp"] == 0: | |||||
hyperparameters.pop("epsilon", None) | |||||
hyperparameters.pop("delta", None) | |||||
hyperparameters.pop("clipping_mode", None) | |||||
hyperparameters.pop("clipping_threshold", None) | |||||
if hyperparameters["use_wandb"] == 0: | |||||
hyperparameters.pop("wandb_project_name", None) | |||||
hyperparameters.pop("use_wandb", None) | |||||
if hyperparameters["two_step_training"] == 0: | |||||
hyperparameters.pop("lr_two", None) | |||||
hyperparameters.pop("virtual_batch_size_two", None) | |||||
hyperparameters.pop("epochs_two", None) | |||||
hyperparameters.pop("weight_decay_two", None) | |||||
hyperparameters.pop("f", None) | |||||
hyperparameters.pop("media_path", None) | |||||
hyperparameters.pop("model_cache_path", None) | |||||
return hyperparameters | |||||
def copy_model_weights(model1, model2):
    # Copy model1's parameters into model2 in place; assumes both models share the same
    # architecture so their parameters align one-to-one
model1.eval() | |||||
model2.eval() | |||||
params1 = model1.parameters() | |||||
params2 = model2.parameters() | |||||
with torch.no_grad(): | |||||
for param1, param2 in zip(params1, params2): | |||||
param2.data.copy_(param1.data) |
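# Illustrative usage (not part of the original code; the dictionary values below are made
# up): clean_hyperparameters trims run-irrelevant config entries, and copy_model_weights
# syncs two identically shaped models in place.
if __name__ == "__main__":
    import torch.nn as nn

    hparams = {"scheduler": 0, "peft_mode": "bitfit", "dp": 0, "use_wandb": 0,
               "two_step_training": 0, "lr": 2e-4, "rank": 8, "epsilon": 3}
    print(clean_hyperparameters(hparams))  # rank and epsilon are dropped for this configuration

    src, dst = nn.Linear(4, 4), nn.Linear(4, 4)
    copy_model_weights(src, dst)
    assert torch.equal(src.weight, dst.weight) and torch.equal(src.bias, dst.bias)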
# Privacy-Preserving Fine-tuning of Parameter-Efficient Language Models | |||||
Details coming soon!