| @@ -0,0 +1,3 @@ | |||
| # Sentence Classification Task on GLUE Benchmark | |||
| Details coming soon! | |||
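| A typical invocation (assuming the training entry script is saved as `main.py`) would be: `python main.py --dataset sst2 --peft_mode lora --dp 1 --epsilon 3`; all flags are defined in the `Config` class below. | |||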
| @@ -0,0 +1,52 @@ | |||
| import argparse | |||
| import torch | |||
| from media import media_path | |||
| class Config: | |||
| def __init__(self): | |||
| self.parser = argparse.ArgumentParser() | |||
| self.add_arguments() | |||
| self.args = self.parse() | |||
| self.post_process() | |||
| def parse(self): | |||
| return self.parser.parse_args() | |||
| def add_arguments(self): | |||
| self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||
| self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||
| self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility') | |||
| self.parser.add_argument('--batch_size', type=int, default=16, help='batch size for training') | |||
| self.parser.add_argument('--virtual_batch_size', type=int, default=16, help='batch size for updating model parameters') | |||
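| # virtual_batch_size is the logical batch per optimizer update; batch_size is the physical per-forward batch (and the max physical batch under DP), so virtual_batch_size must be a multiple of batch_size. | |||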
| self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||
| self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate') | |||
| self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||
| self.parser.add_argument('--optimizer_eps', type=float, default=1e-8, help='optimizer eps') | |||
| self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||
| self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||
| self.parser.add_argument('--max_length', type=int, default=128, help='Max length for tokenization') | |||
| self.parser.add_argument('--peft_mode', type=str, default='lora', choices=['lora', 'bitfit', 'full', 'lorabitfit'], help='PEFT mode for fine-tuning') | |||
| self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||
| self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||
| self.parser.add_argument('--dataset', type=str, default='sst2', choices=['sst2', 'mnli', 'qqp', 'qnli'], help='Dataset name') | |||
| self.parser.add_argument('--toy_example', type=int, default=0, help='If 1, only the first 1024 examples of the train split are used for training') | |||
| self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||
| self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||
| self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||
| self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||
| self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||
| self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||
| self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||
| self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||
| self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes lora then bitfit") | |||
| def post_process(self): | |||
| assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size" | |||
| self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||
| self.args.media_path = media_path | |||
| @@ -0,0 +1,107 @@ | |||
| from config import Config | |||
| from src.model import prepare_model | |||
| from src.data import prepare_data | |||
| from src.train import Trainer | |||
| import os | |||
| import random | |||
| import numpy as np | |||
| import torch | |||
| import wandb | |||
| import logging | |||
| import transformers | |||
| import warnings | |||
| warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes ") | |||
| transformers.logging.set_verbosity_error() | |||
| def set_seeds(seed: int): | |||
| os.environ['PYTHONHASHSEED'] = str(seed) | |||
| random.seed(seed) | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| torch.cuda.manual_seed(seed) | |||
| torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||
| transformers.set_seed(seed) | |||
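| # Copies weights from model1 into model2; assumes both models share the same architecture so that parameters() line up one-to-one. | |||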
| def copy_model_weights(model1, model2): | |||
| model1.eval() | |||
| model2.eval() | |||
| params1 = model1.parameters() | |||
| params2 = model2.parameters() | |||
| with torch.no_grad(): | |||
| for param1, param2 in zip(params1, params2): | |||
| param2.data.copy_(param1.data) | |||
| # Returns number of trainable parameters of the model | |||
| def get_number_of_trainable_parameters(model): | |||
| return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||
| # Returns number of parameters of the model | |||
| def get_number_of_parameters(model): | |||
| return sum(p.numel() for p in model.parameters()) | |||
| def main(cfg): | |||
| set_seeds(cfg.seed) | |||
| model, tokenizer = prepare_model(cfg) | |||
| num_of_all_params = get_number_of_parameters(model) | |||
| num_of_trainable_params = get_number_of_trainable_parameters(model) | |||
| percentage = round(100 * num_of_trainable_params / num_of_all_params, 2) | |||
| logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}") | |||
| logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
| train_loader, val_loader_one, val_loader_two = prepare_data(cfg, tokenizer) | |||
| logging.info("Data is ready") | |||
| trainer = Trainer(cfg, model, train_loader) | |||
| trainer.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||
| if cfg.two_step_training: | |||
| if cfg.dp: | |||
| trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||
| model_two, _ = prepare_model(cfg) | |||
| copy_model_weights(model, model_two) | |||
| del model | |||
| model = model_two | |||
| for a, b in model.roberta.named_parameters(): | |||
| if 'bias' in a: | |||
| b.requires_grad = True | |||
| else: | |||
| b.requires_grad = False | |||
| logging.info("New Model adjusted") | |||
| num_of_all_params = get_number_of_parameters(model) | |||
| num_of_trainable_params = get_number_of_trainable_parameters(model) | |||
| percentage = round(100 * num_of_trainable_params / num_of_all_params, 2) | |||
| logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}") | |||
| logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
| trainer_two = Trainer(cfg, model, train_loader, checkpoint="temp.pth") | |||
| trainer_two.train_and_evaluate(cfg.epochs, train_loader, val_loader_one, val_loader_two) | |||
| if cfg.use_wandb: | |||
| wandb.finish() | |||
| if __name__ == "__main__": | |||
| cfg = Config().args | |||
| log_path = "logs/" | |||
| if not os.path.exists(log_path): | |||
| os.makedirs(log_path) | |||
| log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||
| if cfg.use_wandb: | |||
| wandb.login(key="YOUR_KEY") | |||
| if cfg.run_name: | |||
| wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||
| else: | |||
| wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||
| log_file_name = wandb.run.name | |||
| logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||
| logging.info("Start of the logging") | |||
| hyperparameters = {key: value for key, value in vars(cfg).items()} | |||
| hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||
| logging.info("config:\n" + hyperparameters_str) | |||
| main(cfg) | |||
| @@ -0,0 +1 @@ | |||
| media_path = "YOUR_MEDIA_PATH" | |||
| @@ -0,0 +1,45 @@ | |||
| from datasets import load_from_disk | |||
| from torch.utils.data import DataLoader | |||
| from torch.utils.data import WeightedRandomSampler | |||
| TASK_TO_KEYS = { | |||
| "mnli": ("premise", "hypothesis"), | |||
| "qnli": ("question", "sentence"), | |||
| "qqp": ("question1", "question2"), | |||
| "sst2": ("sentence", None), | |||
| } | |||
| def prepare_data(cfg, tokenizer): | |||
| dataset = load_from_disk(f"{cfg.media_path}saved_datasets/{cfg.dataset}") | |||
| sentence1_key, sentence2_key = TASK_TO_KEYS[cfg.dataset] | |||
| if cfg.toy_example: | |||
| dataset["train"] = dataset["train"].select(range(1024)) | |||
| def tokenize(batch): | |||
| args = ((batch[sentence1_key],) if sentence2_key is None else (batch[sentence1_key], batch[sentence2_key])) | |||
| return tokenizer(*args, padding="max_length", truncation=True, max_length=cfg.max_length) | |||
| dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset)) | |||
| dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label']) | |||
| cfg.train_data_size = len(dataset['train']) | |||
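| # Uniform-weight sampling with replacement (each weight is virtual_batch_size / train_data_size), presumably to mimic the Poisson-style subsampling assumed by the DP accountant. | |||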
| sampler = WeightedRandomSampler([cfg.virtual_batch_size/cfg.train_data_size for _ in range(cfg.train_data_size)], num_samples=cfg.train_data_size, replacement=True) | |||
| train_loader = DataLoader(dataset['train'], batch_size=cfg.virtual_batch_size, sampler=sampler, drop_last=True) | |||
| validation_loader_one = None | |||
| validation_loader_two = None | |||
| if cfg.dataset == "mnli": | |||
| if cfg.toy_example: | |||
| dataset["validation_matched"] = dataset["validation_matched"].select(range(100)) | |||
| dataset["validation_mismatched"] = dataset["validation_mismatched"].select(range(100)) | |||
| validation_loader_one = DataLoader(dataset['validation_matched'], batch_size=cfg.batch_size) | |||
| validation_loader_two = DataLoader(dataset['validation_mismatched'], batch_size=cfg.batch_size) | |||
| else: | |||
| if cfg.toy_example: | |||
| dataset["validation"] = dataset["validation"].select(range(100)) | |||
| validation_loader_one = DataLoader(dataset['validation'], batch_size=cfg.batch_size) | |||
| return train_loader, validation_loader_one, validation_loader_two | |||
| @@ -0,0 +1,91 @@ | |||
| from transformers import RobertaForSequenceClassification, RobertaTokenizer | |||
| import logging | |||
| import torch | |||
| def prepare_model(cfg): | |||
| tokenizer = RobertaTokenizer.from_pretrained(f"{cfg.media_path}models/roberta-large-tokenizer") | |||
| model = RobertaForSequenceClassification.from_pretrained(f"{cfg.media_path}models/roberta-large-model") | |||
| if cfg.dataset == 'mnli': | |||
| model.classifier.out_proj = torch.nn.Linear(model.classifier.out_proj.in_features, 3, bias=True) | |||
| # adjust model parameters | |||
| if cfg.peft_mode == "lora": | |||
| mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||
| freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||
| logging.info("LoRA model loaded") | |||
| elif cfg.peft_mode == "bitfit": | |||
| for a, b in model.roberta.named_parameters(): | |||
| if not 'bias' in a: | |||
| b.requires_grad = False | |||
| logging.info("BiTFiT model loaded") | |||
| elif cfg.peft_mode == "lorabitfit": | |||
| mutate_model(model.roberta, rank=cfg.rank, alpha=cfg.alpha) | |||
| freeze_non_LoRA(model.roberta, peft_key='sharif_llm') | |||
| if cfg.two_step_training == 0: | |||
| for a, b in model.roberta.named_parameters(): | |||
| if 'bias' in a: | |||
| b.requires_grad = True | |||
| logging.info("LoRA and BiTFiT combined model loaded") | |||
| elif cfg.peft_mode == "full": | |||
| logging.info("Full model loaded") | |||
| else: | |||
| logging.info("No acceptable model to load") | |||
| model.to(cfg.device) | |||
| return model, tokenizer | |||
| class LoRALayer(torch.nn.Module): | |||
| def __init__( | |||
| self, | |||
| module: torch.nn.Linear, | |||
| rank: int , | |||
| alpha: float | |||
| ): | |||
| super().__init__() | |||
| self.rank = rank | |||
| self.alpha = alpha | |||
| self.scaling = self.alpha / self.rank # scaling factor | |||
| self.in_dim = module.in_features | |||
| self.out_dim = module.out_features | |||
| self.pretrained = module | |||
| self.sharif_llm_A = torch.nn.Linear(self.in_dim, self.rank, bias=False) | |||
| torch.nn.init.kaiming_normal_(self.sharif_llm_A.weight) | |||
| self.sharif_llm_B = torch.nn.Linear(self.rank, self.out_dim, bias=False) | |||
| torch.nn.init.zeros_(self.sharif_llm_B.weight) | |||
| def forward(self, x: torch.Tensor): | |||
| pretrained_out = self.pretrained(x) | |||
| lora_out = self.sharif_llm_A(x) # x@A | |||
| lora_out = self.sharif_llm_B(lora_out) # x@A@B | |||
| lora_out = self.scaling * lora_out # Scale by the scaling factor | |||
| return pretrained_out + lora_out # x@W + x@A@B*(scaling_factor) | |||
| def mutate_model(model: torch.nn.Module, rank: int, alpha: float): | |||
| """ | |||
| Replaces all linear layers in the model with LoRALinear layers. | |||
| Freeze all params except LoRA params. | |||
| """ | |||
| # make sure the model does not already contain LoRALayer modules; return if it does | |||
| for name, module in model.named_modules(): | |||
| if isinstance(module, LoRALayer): | |||
| logging.info("Model already contains LoRALinear layers! \n Try reloading the model.") | |||
| return | |||
| # we want to replace all query and value Linear modules with LoRALayer | |||
| for name, module in model.named_children(): | |||
| # if the module is linear and the name is for query or value | |||
| if isinstance(module, torch.nn.Linear) and (name == 'query' or name == 'value'): | |||
| # replace the module with LoRALayer | |||
| lora_layer = LoRALayer(module, rank, alpha) | |||
| setattr(model, name, lora_layer) | |||
| else: | |||
| mutate_model(module, rank, alpha) # recursively call the function on the module | |||
| def freeze_non_LoRA(model, peft_key): | |||
| for param_name, weights in model.named_parameters(): | |||
| weights.requires_grad = peft_key in param_name | |||
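| # Example (sketch): applying mutate_model and freeze_non_LoRA to a toy module, the same way | |||
| # prepare_model does for model.roberta above; 'sharif_llm' is the LoRA parameter prefix used by LoRALayer. | |||
| if __name__ == "__main__": | |||
| class ToyAttention(torch.nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.query = torch.nn.Linear(16, 16) | |||
| self.value = torch.nn.Linear(16, 16) | |||
| toy = ToyAttention() | |||
| mutate_model(toy, rank=4, alpha=8) # query/value Linear layers become LoRALayer wrappers | |||
| freeze_non_LoRA(toy, peft_key='sharif_llm') | |||
| trainable = [n for n, p in toy.named_parameters() if p.requires_grad] | |||
| print(trainable) # only the sharif_llm_A / sharif_llm_B weights remain trainable | |||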
| @@ -0,0 +1,147 @@ | |||
| from transformers import get_linear_schedule_with_warmup | |||
| import logging | |||
| import torch | |||
| from torch.nn import CrossEntropyLoss | |||
| from torch.optim import AdamW | |||
| from tqdm import tqdm | |||
| from sklearn.metrics import accuracy_score | |||
| import wandb | |||
| import math | |||
| from opacus import PrivacyEngine | |||
| from opacus.utils.batch_memory_manager import BatchMemoryManager | |||
| class Trainer: | |||
| def __init__(self, cfg, model, train_loader, checkpoint=None): | |||
| self.criterion = CrossEntropyLoss() | |||
| self.val_criterion = CrossEntropyLoss() | |||
| self.optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, eps=cfg.optimizer_eps) | |||
| self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||
| total_steps = math.ceil(len(train_loader) / self.gradient_accumulation_steps) * cfg.epochs | |||
| if cfg.scheduler: | |||
| self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=cfg.scheduler_warmup_ratio*total_steps, num_training_steps=total_steps) | |||
| self.dp = cfg.dp | |||
| self.model = model | |||
| self.cfg = cfg | |||
| if cfg.dp: | |||
| self.model.train() | |||
| self.privacy_engine = PrivacyEngine( | |||
| accountant="rdp", | |||
| ) | |||
| if checkpoint: | |||
| self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||
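| # make_private_with_epsilon calibrates the noise multiplier from the target (epsilon, delta) and the planned number of epochs; the wrapped data loader it returns is discarded here. | |||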
| self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon( | |||
| module=self.model, | |||
| optimizer=self.optimizer, | |||
| data_loader=train_loader, | |||
| target_epsilon=cfg.epsilon, | |||
| target_delta=cfg.delta, | |||
| epochs=cfg.epochs, | |||
| max_grad_norm=cfg.clipping_threshold, | |||
| ) | |||
| def train_step(self, train_loader): | |||
| train_loss = 0 | |||
| self.model.train() | |||
| self.optimizer.zero_grad() | |||
| if self.cfg.dp: | |||
| with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader: | |||
| for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||
| # Move batch tensors to the same device as the model | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| # Forward pass | |||
| outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
| loss = self.criterion(outputs.logits, batch["label"]) | |||
| loss.backward() | |||
| train_loss += loss.mean().item() | |||
| self.optimizer.step() | |||
| self.optimizer.zero_grad() | |||
| if self.cfg.scheduler: | |||
| self.scheduler.step() | |||
| else: | |||
| for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)): | |||
| # Move batch tensors to the same device as the model | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| # Forward pass | |||
| outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
| loss = self.criterion(outputs.logits, batch["label"]) | |||
| loss.backward() | |||
| train_loss += loss.mean().item() | |||
| self.optimizer.step() | |||
| self.optimizer.zero_grad() | |||
| if self.cfg.scheduler: | |||
| self.scheduler.step() | |||
| return train_loss/len(train_loader) | |||
| def evaluate_step(self, val_loader): | |||
| # Evaluation loop | |||
| val_loss = 0 | |||
| self.model.eval() | |||
| predictions = [] | |||
| true_labels = [] | |||
| with torch.no_grad(): | |||
| for batch in tqdm(val_loader): | |||
| # Move batch tensors to the same device as the model | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| # Forward pass and compute validation loss | |||
| outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"]) | |||
| loss = self.val_criterion(outputs.logits, batch["label"]) | |||
| _, preds = torch.max(outputs.logits, dim=1) | |||
| predictions.extend(preds.tolist()) | |||
| true_labels.extend(batch["label"].tolist()) | |||
| val_loss += loss.item() | |||
| accuracy = accuracy_score(true_labels, predictions) | |||
| return accuracy , val_loss/len(val_loader) | |||
| def train_and_evaluate(self, epochs, train_loader, val_loader_one, val_loader_two): | |||
| best_accuracy = 0 | |||
| best_accuracy_two = 0 | |||
| wandb_log = [] | |||
| for epoch in range(epochs): | |||
| log_data = {} | |||
| train_loss = self.train_step(train_loader) | |||
| log_data["train_loss"] = train_loss | |||
| logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||
| accuracy, val_loss = self.evaluate_step(val_loader=val_loader_one) | |||
| log_data["validation_loss"] = val_loss | |||
| log_data["accuracy"] = accuracy | |||
| if accuracy > best_accuracy: | |||
| best_accuracy = accuracy | |||
| logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||
| logging.info(f"Accuracy on validation set: {accuracy * 100} %") | |||
| if val_loader_two: | |||
| accuracy_two , val_loss_two = self.evaluate_step(val_loader=val_loader_two) | |||
| log_data["validation_two_loss"] = val_loss_two | |||
| log_data["accuracy_two"] = accuracy_two | |||
| if accuracy_two > best_accuracy_two: | |||
| best_accuracy_two = accuracy_two | |||
| logging.info(f"Epoch {epoch+1} Validation two loss: {val_loss_two}") | |||
| logging.info(f"Accuracy on validation two set: {accuracy_two * 100} %") | |||
| wandb_log.append(log_data) | |||
| logging.info("Best results:") | |||
| if self.cfg.dp: | |||
| logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||
| logging.info(f"Best validatin accuracy: {best_accuracy}") | |||
| if val_loader_two: | |||
| logging.info(f"Second validation set accuracy: {best_accuracy_two}") | |||
| if self.cfg.use_wandb: | |||
| for epoch_data in wandb_log: | |||
| wandb.log(epoch_data) | |||
| @@ -0,0 +1,3 @@ | |||
| # Text Generation Task on E2E dataset | |||
| Details coming soon! | |||
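| A typical invocation (assuming the training entry script is saved as `main.py`) would be: `python main.py --dataset e2e_nlg --model_name gpt2 --peft_mode bitfit --dp 1 --epsilon 3`; all flags are defined in the `Config` class below. | |||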
| @@ -0,0 +1,68 @@ | |||
| import argparse | |||
| import torch | |||
| from media import * | |||
| class Config: | |||
| def __init__(self): | |||
| self.parser = argparse.ArgumentParser() | |||
| self.add_arguments() | |||
| self.args = self.parse() | |||
| self.post_process() | |||
| def parse(self): | |||
| return self.parser.parse_args() | |||
| def add_arguments(self): | |||
| self.parser.add_argument('--device', type=int, default=0, help='Device number to use for training') | |||
| # self.parser.add_argument('--gpu_count', type=int, default=1, help='Number of GPUs available') | |||
| self.parser.add_argument('--seed', type=int, default=1234, help='Set seed for reproducibility') | |||
| self.parser.add_argument('--batch_size', type=int, default=8, help='batch size for training') | |||
| self.parser.add_argument('--virtual_batch_size', type=int, default=8, help='batch size for updating model parameters') | |||
| self.parser.add_argument('--epochs', type=int, default=5, help='Number of epochs for training') | |||
| self.parser.add_argument('--lr', type=float, default=2e-3, help='Learning rate') | |||
| self.parser.add_argument('--weight_decay', type=float, default=0.1, help='Weight decay for optimizer') | |||
| self.parser.add_argument('--optimizer_eps', type=float, default=1e-6, help='optimizer eps') | |||
| self.parser.add_argument("--scheduler", type=int, default=1, help="Uses scheduler if 1") | |||
| self.parser.add_argument("--scheduler_type", type=str, default="linear", choices=['linear', 'steplr'], help="Scheduler types") | |||
| self.parser.add_argument('--scheduler_warmup_ratio', type=float, default=0.06, help='Scheduler warmup ratio * total steps = warmup steps') | |||
| self.parser.add_argument('--scheduler_warmup_steps', type=int, default=None, help='Warmup steps can be given directly') | |||
| self.parser.add_argument('--scheduler_step_size', type=int, default=1, help='Scheduler step size for stepLR scheduler') | |||
| self.parser.add_argument('--scheduler_gamma', type=float, default=0.5, help='Scheduler decrease rate for stepLR scheduler') | |||
| self.parser.add_argument('--model_name', type=str, default='gpt2', choices=['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'], help='GPT-2 model variant to fine-tune') | |||
| self.parser.add_argument('--seq_length', type=int, default=128, help='Max length for tokenization') | |||
| self.parser.add_argument('--peft_mode', type=str, default='bitfit', choices=['lora', 'bitfit', 'full', 'lorabitfit', 'adapter', 'adapterbitfit'], help='PEFT mode for fine-tuning') | |||
| self.parser.add_argument('--rank', type=int, default=8, help='Rank for lora') | |||
| self.parser.add_argument('--alpha', type=int, default=16, help='Alpha for lora') | |||
| self.parser.add_argument('--drop_out', type=float, default=0.0, help='Dropout for lora') | |||
| self.parser.add_argument('--reduction_factor', type=int, default=16, help='Reduction factor for the adapter bottleneck') | |||
| self.parser.add_argument('--dataset', type=str, default='e2e_nlg', choices=['e2e_nlg', 'dart'], help='Dataset name') | |||
| self.parser.add_argument('--toy_example', type=int, default=0, help='If 1, only the first 1024 examples of the train split are used for training') | |||
| self.parser.add_argument("--dp", type=int, default=0, help="Fine-tune using differential privacy if 1") | |||
| self.parser.add_argument("--epsilon", type=int, default=3, help="Epsilon in privacy budget") | |||
| self.parser.add_argument("--delta", type=float, default=1e-5, help="Delta in privacy budget") | |||
| self.parser.add_argument('--clipping_mode', type=str, default='default', choices=['default', 'ghost'], help='Clipping mode for DP fine-tuning') | |||
| self.parser.add_argument("--clipping_threshold", type=float, default=0.1, help="Max grad norm") | |||
| self.parser.add_argument("--use_wandb", type=int, default=0, help="Uses wandb if 1") | |||
| self.parser.add_argument("--wandb_project_name", type=str, default="Project-DP", help="Wandb project name") | |||
| self.parser.add_argument("--run_name", type=str, default=None, help="run name") | |||
| self.parser.add_argument("--beam_size", type=int, default=5, help="Number of beans for generation") | |||
| self.parser.add_argument('--f', type=str, default=None, help='Path to Jupyter kernel JSON file') | |||
| self.parser.add_argument("--two_step_training", type=int, default=0, help="if 1, first finetunes adapter or lora then bitfit") | |||
| self.parser.add_argument('--lr_two', type=float, default=2e-3, help='Learning rate for second step of training') | |||
| self.parser.add_argument('--virtual_batch_size_two', type=int, default=8, help='batch size for updating model parameters for second step of training') | |||
| self.parser.add_argument('--epochs_two', type=int, default=5, help='Number of epochs for second step training') | |||
| self.parser.add_argument('--weight_decay_two', type=float, default=0.1, help='Weight decay for second optimizer') | |||
| def post_process(self): | |||
| assert self.args.virtual_batch_size % self.args.batch_size == 0, "virtual_batch_size should be divisible by batch_size" | |||
| self.args.device = torch.device(f'cuda:{self.args.device}' if torch.cuda.is_available() else "cpu") | |||
| self.args.media_path = media_path | |||
| self.args.model_cache_path = model_cache_path | |||
| @@ -0,0 +1,309 @@ | |||
| from datasets import load_from_disk | |||
| import torch | |||
| from torch.utils.data import DataLoader, WeightedRandomSampler | |||
| import copy | |||
| import sys | |||
| from torch.utils.data.dataset import Dataset | |||
| from transformers.tokenization_utils import PreTrainedTokenizer | |||
| from dataclasses import dataclass | |||
| from typing import Any, Callable, Dict, List, NewType, Tuple, Union | |||
| from torch.nn.utils.rnn import pad_sequence | |||
| from transformers.tokenization_utils_base import BatchEncoding | |||
| def load_dataset(dataset_name, path, toy_example): | |||
| dataset = load_from_disk(f"{path}saved_datasets/{dataset_name}") | |||
| # toy example for develop | |||
| if toy_example == 1: | |||
| dataset["train"] = dataset["train"].select(range(1024)) | |||
| dataset["validation"] = dataset["validation"].select(range(512)) | |||
| return dataset | |||
| def load_dataloaders(dataset, dataset_name, batch_size, virtual_batch_size, tokenizer, seq_length, dp=1): | |||
| data_collator = DataCollatorForData2TextLanguageModeling(tokenizer) | |||
| if dataset_name == 'e2e_nlg': | |||
| train_dataset = E2ETextDataset(tokenizer, | |||
| dataset["train"]["meaning_representation"], | |||
| dataset["train"]["human_reference"], | |||
| seq_length, | |||
| tokenizer.bos_token, | |||
| tokenizer.eos_token, | |||
| seq_length) | |||
| validation_dataset = E2ETextDataset(tokenizer, | |||
| dataset["validation"]["meaning_representation"], | |||
| dataset["validation"]["human_reference"], | |||
| seq_length, | |||
| tokenizer.bos_token, | |||
| tokenizer.eos_token, | |||
| seq_length) | |||
| train_data_size = len(dataset["train"]) | |||
| if dp == 1: | |||
| sampler = WeightedRandomSampler([virtual_batch_size/train_data_size for _ in range(train_data_size)], num_samples=train_data_size, replacement=True) | |||
| train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, sampler=sampler, drop_last=True, collate_fn=data_collator) | |||
| else: | |||
| train_loader = DataLoader(train_dataset, batch_size=virtual_batch_size, collate_fn=data_collator) | |||
| validation_loader = DataLoader(validation_dataset, batch_size=batch_size, collate_fn=data_collator) | |||
| elif dataset_name == 'dart': | |||
| raise NotImplementedError("Support for the 'dart' dataset has not been implemented yet") | |||
| return train_loader, validation_loader | |||
| # Copyright (c) Xuechen Li. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| class E2ETextDataset(Dataset): | |||
| def __init__( | |||
| self, | |||
| tokenizer: PreTrainedTokenizer, | |||
| src_lines, | |||
| tgt_lines, | |||
| block_size: int, | |||
| bos_tok: str, | |||
| eos_tok: str, | |||
| max_seq_len=sys.maxsize, | |||
| max_examples=sys.maxsize, | |||
| **_, | |||
| ): | |||
| edited_sents = [] | |||
| for src, tgt in zip(src_lines, tgt_lines): | |||
| sent = ' {} {} '.format(src, bos_tok) + tgt + ' {}'.format(eos_tok) | |||
| edited_sents.append(sent) | |||
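| # Each example is formatted as ' <meaning_representation> <bos> <human_reference> <eos>'; the prompt up to and including the bos token is later masked to -100 so only the reference contributes to the LM loss. | |||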
| # --- Filter out super long sentences --- | |||
| new_src_lines, new_tgt_lines, new_edited_sents = [], [], [] | |||
| for src_line, tgt_line, edited_sent in zip(src_lines, tgt_lines, edited_sents): | |||
| tokenized_edited_sent = tokenizer.tokenize(edited_sent) | |||
| if len(tokenized_edited_sent) <= max_seq_len: | |||
| new_src_lines.append(src_line) | |||
| new_tgt_lines.append(tgt_line) | |||
| new_edited_sents.append(edited_sent) | |||
| del src_line, tgt_line, edited_sent | |||
| src_lines, tgt_lines, edited_sents = new_src_lines, new_tgt_lines, new_edited_sents | |||
| # --------------------------------------- | |||
| # --- Truncate the dataset if necessary; this must be after the length filtering. --- | |||
| src_lines = src_lines[:max_examples] | |||
| tgt_lines = tgt_lines[:max_examples] | |||
| edited_sents = edited_sents[:max_examples] | |||
| # --- | |||
| batch_encoding = tokenizer( | |||
| edited_sents, | |||
| add_special_tokens=True, | |||
| truncation=True, | |||
| max_length=block_size, | |||
| is_split_into_words=False, | |||
| ) | |||
| self.examples = batch_encoding["input_ids"] | |||
| self.labels = copy.deepcopy(self.examples) | |||
| # split into category words: | |||
| ssl_lst = [] | |||
| for ss in src_lines: | |||
| ssl = [la.split(':')[0].strip() for la in ss.split('|')] | |||
| ssl_lst.append(ssl) | |||
| self.src_cat = tokenizer( | |||
| ssl_lst, | |||
| add_special_tokens=True, | |||
| truncation=True, | |||
| max_length=block_size, | |||
| is_split_into_words=True | |||
| )['input_ids'] | |||
| self.src_sent = [] | |||
| self.tgt_sent = [] | |||
| # temp_src_len = 0 | |||
| # temp_tgt_len = 0 | |||
| # temp_count = 0 | |||
| separator = tokenizer(bos_tok, add_special_tokens=False)['input_ids'][0] | |||
| for i, elem in enumerate(self.labels): | |||
| sep_idx = elem.index(separator) + 1 | |||
| self.src_sent.append(self.examples[i][:sep_idx - 1]) | |||
| self.tgt_sent.append(self.examples[i][sep_idx - 1:]) | |||
| self.labels[i][:sep_idx] = [-100] * sep_idx # Doesn't contribute to loss. | |||
| # temp_src_len += sep_idx - 1 | |||
| # temp_tgt_len += len(elem) - (sep_idx - 1) | |||
| # temp_count += 1 | |||
| # print('tgt_avg: ', temp_tgt_len / temp_count) | |||
| # print('src_avg: ', temp_src_len / temp_count) | |||
| # print('ratios: ', temp_src_len / temp_tgt_len) | |||
| # print(self.labels[0]) | |||
| # print(self.examples[0]) | |||
| # print(edited_sents[0]) | |||
| # print(self.src_sent[0]) | |||
| # print(self.tgt_sent[0]) | |||
| # print(self.src_cat[0]) | |||
| assert len(self.src_cat) == len(self.examples) | |||
| def __len__(self): | |||
| return len(self.examples) | |||
| def __getitem__(self, i): | |||
| return ( | |||
| torch.tensor(self.examples[i], dtype=torch.long), | |||
| torch.tensor(self.labels[i], dtype=torch.long), | |||
| torch.tensor(self.src_sent[i], dtype=torch.long), | |||
| torch.tensor(self.tgt_sent[i], dtype=torch.long), | |||
| torch.tensor(self.src_cat[i], dtype=torch.long), | |||
| ) | |||
| # InputDataClass = NewType("InputDataClass", Any) | |||
| """ | |||
| A DataCollator is a function that takes a list of samples from a Dataset | |||
| and collates them into a batch, as a dictionary of Tensors. | |||
| """ | |||
| # DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]]) | |||
| @dataclass | |||
| class DataCollatorForData2TextLanguageModeling: | |||
| """ | |||
| Data collator used for language modeling. | |||
| - collates batches of tensors, honoring their tokenizer's pad_token | |||
| - preprocesses batches for masked language modeling | |||
| """ | |||
| tokenizer: PreTrainedTokenizer | |||
| mlm: bool = False | |||
| format_mode: str = 'cat' | |||
| mlm_probability: float = 0.15 | |||
| def __call__( | |||
| self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||
| ) -> Dict[str, torch.Tensor]: | |||
| if isinstance(examples[0], (dict, BatchEncoding)): | |||
| examples = [e["input_ids"] for e in examples] | |||
| input_ids, labels, src, tgt, cate = zip(*examples) | |||
| if self.mlm: | |||
| batch = self._tensorize_batch(input_ids) | |||
| inputs, labels = self.mask_tokens(batch) | |||
| return {"input_ids": inputs, "labels": labels} | |||
| else: | |||
| if self.format_mode == 'cat': | |||
| mode_input = 3 | |||
| elif self.format_mode == 'peek': | |||
| mode_input = 1 | |||
| elif self.format_mode == 'nopeek': | |||
| mode_input = 2 | |||
| elif self.format_mode == 'infix': | |||
| mode_input = 4 | |||
| # mode_input = 1 # means that we take the input again. | |||
| # mode_input = 2 # means that we do not peek at src again. | |||
| # mode_input = 3 # means that we look at the categories, and see the input again. | |||
| if mode_input == 1: | |||
| # input, batch | |||
| batch = self._tensorize_batch(input_ids) | |||
| labels = self._tensorize_batch(labels) | |||
| src = self._tensorize_batch(src) | |||
| cate_batch, cate_attn = None, None | |||
| # tgt = self._tensorize_batch(tgt) | |||
| elif mode_input == 2: | |||
| # nopeek. | |||
| batch = self._tensorize_batch(tgt) | |||
| labels = batch.clone() | |||
| src = self._tensorize_batch(src) | |||
| cate_batch, cate_attn = None, None | |||
| elif mode_input == 3: | |||
| batch = self._tensorize_batch(input_ids) | |||
| labels = self._tensorize_batch(labels) | |||
| src = self._tensorize_batch(cate) | |||
| cate_batch, cate_attn = None, None | |||
| elif mode_input == 4: | |||
| batch = self._tensorize_batch(tgt) | |||
| labels = batch.clone() | |||
| src = self._tensorize_batch(src) | |||
| cate_batch = self._tensorize_batch(cate) | |||
| cate_attn = (cate_batch != self.tokenizer.pad_token_id) | |||
| labels[labels == self.tokenizer.pad_token_id] = -100 # tgt | |||
| src_attn = (src != self.tokenizer.pad_token_id) # src | |||
| tgt_attn = (batch != self.tokenizer.pad_token_id) # tgt | |||
| if cate_batch is None: | |||
| return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn':tgt_attn, | |||
| 'src':src} | |||
| else: | |||
| return {"input_ids": batch, "labels": labels, 'src_attn': src_attn, 'tgt_attn': tgt_attn, | |||
| 'src': src, "cate_batch":cate_batch, "cate_attn":cate_attn} | |||
| def _tensorize_batch( | |||
| self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] | |||
| ) -> torch.Tensor: | |||
| # In order to accept both lists of lists and lists of Tensors | |||
| if isinstance(examples[0], (list, tuple)): | |||
| examples = [torch.tensor(e, dtype=torch.long) for e in examples] | |||
| length_of_first = examples[0].size(0) | |||
| are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) | |||
| if are_tensors_same_length: | |||
| return torch.stack(examples, dim=0) | |||
| else: | |||
| if self.tokenizer._pad_token is None: | |||
| raise ValueError( | |||
| "You are attempting to pad samples but the tokenizer you are using" | |||
| f" ({self.tokenizer.__class__.__name__}) does not have one." | |||
| ) | |||
| return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) | |||
| def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """ | |||
| Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. | |||
| """ | |||
| if self.tokenizer.mask_token is None: | |||
| raise ValueError( | |||
| "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." | |||
| ) | |||
| labels = inputs.clone() | |||
| # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) | |||
| probability_matrix = torch.full(labels.shape, self.mlm_probability) | |||
| special_tokens_mask = [ | |||
| self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() | |||
| ] | |||
| probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) | |||
| if self.tokenizer._pad_token is not None: | |||
| padding_mask = labels.eq(self.tokenizer.pad_token_id) | |||
| probability_matrix.masked_fill_(padding_mask, value=0.0) | |||
| masked_indices = torch.bernoulli(probability_matrix).bool() | |||
| labels[~masked_indices] = -100 # We only compute loss on masked tokens | |||
| # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) | |||
| indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices | |||
| inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) | |||
| # 10% of the time, we replace masked input tokens with random word | |||
| indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced | |||
| random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) | |||
| inputs[indices_random] = random_words[indices_random] | |||
| # The rest of the time (10% of the time) we keep the masked input tokens unchanged | |||
| return inputs, labels | |||
| @@ -0,0 +1,125 @@ | |||
| from config import Config | |||
| import os | |||
| import random | |||
| import numpy as np | |||
| import torch | |||
| import wandb | |||
| import logging | |||
| import transformers | |||
| import warnings | |||
| import subprocess | |||
| from model import load_model, prepare_model, get_number_of_trainable_parameters, load_model_weights, get_number_of_parameters | |||
| from data import load_dataset, load_dataloaders | |||
| from train import Trainer, generate_evaluation_output, save_evaluation_output | |||
| from utils import clean_hyperparameters, copy_model_weights | |||
| warnings.filterwarnings("ignore", "Using a non-full backward hook when the forward contains multiple autograd Nodes") | |||
| transformers.logging.set_verbosity_error() | |||
| def set_seeds(seed: int): | |||
| os.environ['PYTHONHASHSEED'] = str(seed) | |||
| random.seed(seed) | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| torch.cuda.manual_seed(seed) | |||
| torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. | |||
| transformers.set_seed(seed) | |||
| def run_metric_script(file_path): | |||
| result = subprocess.run(["e2e/measure_scores.py", "-p", "e2e_ref.txt", file_path], stdout=subprocess.PIPE) | |||
| output = result.stdout.decode('utf-8') | |||
| lines = output.split('\n') | |||
| return lines[-7:-2] | |||
| def main(cfg): | |||
| set_seeds(cfg.seed) | |||
| model, tokenizer = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||
| model = prepare_model(model, cfg) | |||
| num_of_all_params = get_number_of_parameters(model) | |||
| num_of_trainable_params = get_number_of_trainable_parameters(model) | |||
| percentage = round(100 * num_of_trainable_params / num_of_all_params, 2) | |||
| logging.info(f"Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}") | |||
| logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
| dataset = load_dataset(cfg.dataset, cfg.media_path, cfg.toy_example) | |||
| cfg.train_data_size = len(dataset["train"]) | |||
| # dataset = tokenize_dataset(tokenizer, dataset, cfg.dataset, cfg.seq_length) | |||
| train_loader, validation_loader = load_dataloaders(dataset, cfg.dataset, cfg.batch_size, cfg.virtual_batch_size, tokenizer, cfg.seq_length, cfg.dp) | |||
| logging.info("Dataset loaded and tokenized") | |||
| trainer = Trainer(cfg, model, train_loader) | |||
| trainer.train_and_evaluate(cfg.epochs, train_loader, validation_loader) | |||
| if cfg.two_step_training and cfg.dp: | |||
| trainer.privacy_engine.save_checkpoint(path="temp.pth", module=model) | |||
| model_two, _ = load_model(cfg.model_name, cache_dir=cfg.model_cache_path) | |||
| model_two = prepare_model(model_two, cfg) | |||
| copy_model_weights(model, model_two) | |||
| del model | |||
| model = model_two | |||
| for a, b in model.named_parameters(): | |||
| if 'bias' in a and not 'adapter' in a: | |||
| b.requires_grad = True | |||
| else: | |||
| b.requires_grad = False | |||
| logging.info("New Model adjusted") | |||
| num_of_all_params = get_number_of_parameters(model) | |||
| num_of_trainable_params = get_number_of_trainable_parameters(model) | |||
| percentage = round(100 * num_of_trainable_params / num_of_all_params, 2) | |||
| logging.info(f"New Model loaded successfully and number of trainable params is: {num_of_trainable_params} out of {num_of_all_params}") | |||
| logging.info(f"Percentage of trainable parameters: {percentage} %") | |||
| trainer_two = Trainer(cfg, model, train_loader, second_trainer=True) | |||
| trainer_two.train_and_evaluate(cfg.epochs_two, train_loader, validation_loader) | |||
| # evaluate model on test data | |||
| model.eval() | |||
| model = load_model_weights(model, cfg.peft_mode, f"{trainer.save_path}/{trainer.model_name}.pth") | |||
| evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size) | |||
| output_path = f"{cfg.media_path}generation_eval_outputs/{cfg.dataset}/{cfg.peft_mode}" | |||
| if not os.path.exists(output_path): | |||
| os.makedirs(output_path) | |||
| output_name = cfg.run_name if cfg.run_name else "generation_output" | |||
| save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v1.txt") | |||
| evaluation_output = generate_evaluation_output(model, tokenizer, dataset["test"], cfg.device, cfg.seq_length, cfg.beam_size, do_sample=True) | |||
| save_evaluation_output(evaluation_output, f"{output_path}/{output_name}-v2.txt") | |||
| logging.info("Generation for test data saved") | |||
| metrics = run_metric_script(f"{output_path}/{output_name}-v1.txt") | |||
| logging.info("Metrics without sampling:") | |||
| for metric in metrics: | |||
| logging.info(metric) | |||
| metrics = run_metric_script(f"{output_path}/{output_name}-v2.txt") | |||
| logging.info("Metrics with sampling:") | |||
| for metric in metrics: | |||
| logging.info(metric) | |||
| if cfg.use_wandb: | |||
| wandb.finish() | |||
| if __name__ == "__main__": | |||
| cfg = Config().args | |||
| log_path = f"logs/{cfg.dataset}/{cfg.peft_mode}/" | |||
| if not os.path.exists(log_path): | |||
| os.makedirs(log_path) | |||
| log_file_name = f"{cfg.run_name}.log" if cfg.run_name else "logs.log" | |||
| if cfg.use_wandb: | |||
| wandb.login(key="YOUR_KEY") | |||
| if cfg.run_name: | |||
| wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}", name=cfg.run_name) | |||
| else: | |||
| wandb.init(config=cfg, project=f"{cfg.wandb_project_name}-{cfg.dataset}") | |||
| log_file_name = wandb.run.name | |||
| logging.basicConfig(filename=f"{log_path}{log_file_name}", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True) | |||
| logging.info("Start of the logging") | |||
| hyperparameters = {key: value for key, value in vars(cfg).items()} | |||
| hyperparameters = clean_hyperparameters(hyperparameters) | |||
| hyperparameters_str = "\n".join([f"{key}: {value}" for key, value in hyperparameters.items()]) | |||
| logging.info("config:\n" + hyperparameters_str) | |||
| main(cfg) | |||
| @@ -0,0 +1,2 @@ | |||
| media_path = "YOUR_MEDIA_PATH" | |||
| model_cache_path = "YOUR_CACHE_PATH" | |||
| @@ -0,0 +1,299 @@ | |||
| from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel | |||
| import torch | |||
| import transformers | |||
| from torch import nn | |||
| from transformers.models.gpt2.modeling_gpt2 import GPT2MLP | |||
| import os | |||
| # Loads model and its tokenizer | |||
| def load_model(model_name, cache_dir="."): | |||
| tokenizer = GPT2Tokenizer.from_pretrained(f"{cache_dir}gpt2/{model_name}-tokenizer") | |||
| model = GPT2LMHeadModel.from_pretrained(f"{cache_dir}gpt2/{model_name}-model") | |||
| add_pad_token(model, tokenizer) | |||
| model.requires_grad_(False) | |||
| return model, tokenizer | |||
| # Adds padding token to the tokenizer and model embedding layer | |||
| def add_pad_token(model, tokenizer): | |||
| tokenizer.add_special_tokens({'pad_token': '[PAD]'}) | |||
| model.resize_token_embeddings(len(tokenizer)) | |||
| a = model.get_input_embeddings().weight | |||
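| # Initialize the newly added [PAD] embedding row to the mean of all pre-existing embedding rows | |||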
| a.data[-1] = a.data[:-1].mean(dim=0) | |||
| # Returns number of trainable parameters of the model | |||
| def get_number_of_trainable_parameters(model): | |||
| return sum(p.numel() for p in model.parameters() if p.requires_grad) | |||
| # Returns number of parameters of the model | |||
| def get_number_of_parameters(model): | |||
| return sum(p.numel() for p in model.parameters()) | |||
| # Mutates model structure and adjusts trainable parameters | |||
| def prepare_model(model, cfg): | |||
| if cfg.peft_mode == 'bitfit': | |||
| for a, b in model.named_parameters(): | |||
| if 'bias' in a: | |||
| b.requires_grad = True | |||
| elif cfg.peft_mode == 'lora': | |||
| model.requires_grad_(True) | |||
| model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||
| mark_only_lora_as_trainable(model) | |||
| elif cfg.peft_mode == 'lorabitfit': | |||
| model.requires_grad_(True) | |||
| model = convert_gpt2_attention_to_lora(model, cfg.rank, cfg.alpha, cfg.drop_out) | |||
| mark_only_lora_as_trainable(model) | |||
| if cfg.two_step_training == 0: | |||
| for a, b in model.named_parameters(): | |||
| if 'bias' in a: | |||
| b.requires_grad = True | |||
| elif cfg.peft_mode == 'full': | |||
| model.requires_grad_(True) | |||
| elif cfg.peft_mode == 'adapter': | |||
| model.requires_grad_(False) | |||
| bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||
| mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||
| for a, b in model.named_parameters(): | |||
| if 'adapter' in a: | |||
| b.requires_grad = True | |||
| elif cfg.peft_mode == 'adapterbitfit': | |||
| model.requires_grad_(False) | |||
| bottleneck_size = model.config.n_embd // cfg.reduction_factor | |||
| mutate_model_adapter(model, bottleneck_size, model.config.n_embd) | |||
| if cfg.two_step_training == 0: | |||
| for a, b in model.named_parameters(): | |||
| if 'adapter' in a or 'bias' in a: | |||
| b.requires_grad = True | |||
| else: | |||
| for a, b in model.named_parameters(): | |||
| if 'adapter' in a: | |||
| b.requires_grad = True | |||
| model.to(cfg.device) | |||
| return model | |||
| def save_model(model, peft_mode, save_path, model_name): | |||
| if not os.path.exists(save_path): | |||
| os.makedirs(save_path) | |||
| if peft_mode == "bitfit": | |||
| bias_params = {} | |||
| for name, param in model.named_parameters(): | |||
| if 'bias' in name: | |||
| bias_params[name] = param.data.clone() | |||
| torch.save(bias_params, f'{save_path}/{model_name}.pth') | |||
| elif peft_mode == 'lora': | |||
| lora_params = {} | |||
| for name, param in model.named_parameters(): | |||
| if 'lora' in name: | |||
| lora_params[name] = param.data.clone() | |||
| torch.save(lora_params, f'{save_path}/{model_name}.pth') | |||
| elif peft_mode == 'lorabitfit': | |||
| lorabitfit_params = {} | |||
| for name, param in model.named_parameters(): | |||
| if 'lora' in name or 'bias' in name: | |||
| lorabitfit_params[name] = param.data.clone() | |||
| torch.save(lorabitfit_params, f'{save_path}/{model_name}.pth') | |||
| elif peft_mode == 'full': | |||
| pass | |||
| elif peft_mode == 'adapter': | |||
| adapter_params = {} | |||
| for name, param in model.named_parameters(): | |||
| if 'adapter' in name: | |||
| adapter_params[name] = param.data.clone() | |||
| torch.save(adapter_params, f'{save_path}/{model_name}.pth') | |||
| elif peft_mode == 'adapterbitfit': | |||
| adapterbitfit_params = {} | |||
| for name, param in model.named_parameters(): | |||
| if 'adapter' in name or 'bias' in name: | |||
| adapterbitfit_params[name] = param.data.clone() | |||
| torch.save(adapterbitfit_params, f'{save_path}/{model_name}.pth') | |||
| def load_model_weights(model, peft_mode, path): | |||
| if peft_mode == 'full': | |||
| pass | |||
| else: | |||
| model_weights = torch.load(path) | |||
| with torch.no_grad(): | |||
| for name, param in model.named_parameters(): | |||
| if name in model_weights: | |||
| param.copy_(model_weights[name]) | |||
| return model | |||
| # Copyright (c) Xuechen Li. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ | |||
| LoRA layers. | |||
| This version does not have merged weights for zero latency inference. It makes the code easier to read and maintain. | |||
| Adapted from | |||
| https://github.com/microsoft/LoRA | |||
| https://www.microsoft.com/en-us/research/project/dp-transformers/ | |||
| """ | |||
| class MYDPMergedLinear(nn.Module): | |||
| def __init__( | |||
| self, | |||
| in_features: int, | |||
| out_features: int, | |||
| pretrained_module, | |||
| lora_r=0, | |||
| lora_alpha=1., | |||
| lora_dropout=0., | |||
| ): | |||
| super(MYDPMergedLinear, self).__init__() | |||
| self.pretrained_module = pretrained_module | |||
| self.lora_r = lora_r | |||
| self.lora_alpha = lora_alpha | |||
| self.lora_dropout = nn.Dropout(p=lora_dropout) | |||
| if self.lora_r > 0: | |||
| self.lora_A = nn.Linear(in_features=in_features, out_features=lora_r, bias=False) | |||
| self.lora_B = nn.Linear(in_features=lora_r, out_features=out_features, bias=False) | |||
| self.scaling = self.lora_alpha / lora_r | |||
| self.reset_parameters() | |||
| def forward(self, x: torch.Tensor): | |||
| result = self.pretrained_module(x) | |||
| if self.lora_r > 0: | |||
| after_dropout = self.lora_dropout(x) | |||
| after_A = self.lora_A(after_dropout) | |||
| after_B = self.lora_B(after_A) | |||
| result += after_B * self.scaling | |||
| return result | |||
| def reset_parameters(self): | |||
| # self.linear.reset_parameters() | |||
| if self.lora_r > 0: | |||
| self.lora_A.reset_parameters() | |||
| self.lora_B.weight.data.zero_() | |||
| @staticmethod | |||
| def from_transformers_conv1d( | |||
| original_layer, | |||
| lora_r=0, | |||
| lora_alpha=1., | |||
| lora_dropout=0., | |||
| ) -> "MYDPMergedLinear": | |||
| lora_layer = MYDPMergedLinear( | |||
| in_features=original_layer.weight.shape[0], | |||
| out_features=original_layer.weight.shape[1], | |||
| pretrained_module = original_layer, | |||
| lora_r=lora_r, | |||
| lora_alpha=lora_alpha, | |||
| lora_dropout=lora_dropout, | |||
| ).to(original_layer.weight.device) | |||
| return lora_layer | |||
| def convert_gpt2_attention_to_lora( | |||
| model: transformers.GPT2PreTrainedModel, | |||
| lora_r=0, | |||
| lora_alpha=1., | |||
| lora_dropout=0., | |||
| ) -> transformers.GPT2PreTrainedModel: | |||
| if not isinstance(model, transformers.GPT2PreTrainedModel): | |||
| raise TypeError("Requires a GPT2 model") | |||
| if not hasattr(model, "h") and hasattr(model, "transformer"): | |||
| transformer = model.transformer | |||
| else: | |||
| transformer = model | |||
| for h_i in transformer.h: | |||
| new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||
| original_layer=h_i.attn.c_attn, | |||
| lora_r=lora_r, | |||
| lora_alpha=lora_alpha, | |||
| lora_dropout=lora_dropout, | |||
| ) | |||
| h_i.attn.c_attn = new_layer | |||
| return model | |||
| def mutate_model(model: torch.nn.Module, lora_r=0, lora_alpha=1., lora_dropout=0.): | |||
| for name, module in model.named_children(): | |||
| if name == "c_attn": | |||
| new_layer = MYDPMergedLinear.from_transformers_conv1d( | |||
| original_layer=module, | |||
| lora_r=lora_r, | |||
| lora_alpha=lora_alpha, | |||
| lora_dropout=lora_dropout, | |||
| ) | |||
| setattr(model, name, new_layer) | |||
| else: | |||
| mutate_model(module, lora_r, lora_alpha, lora_dropout) # recursively call the function on the module | |||
| def mark_only_lora_as_trainable(model: torch.nn.Module) -> None: | |||
| model.requires_grad_(True) | |||
| for n, p in model.named_parameters(): | |||
| if 'lora_' not in n: | |||
| p.requires_grad = False | |||
| class AdapterLayer(nn.Module): | |||
| def __init__( | |||
| self, | |||
| emb_dim: int, | |||
| bottleneck_size: int, | |||
| bias = True | |||
| ): | |||
| super().__init__() | |||
| self.sharif_llm_adapter = nn.Sequential( | |||
| nn.Linear(emb_dim, bottleneck_size, bias=bias), | |||
| nn.ReLU(), | |||
| nn.Linear(bottleneck_size, emb_dim, bias=bias) | |||
| ) | |||
| def forward(self, x: torch.Tensor): | |||
| output = x + self.sharif_llm_adapter(x) | |||
| return output | |||
| class FeedForwardAdapterWrapper(nn.Module): | |||
| def __init__( | |||
| self, | |||
| original_module: GPT2MLP, | |||
| bottleneck_size: int, | |||
| emb_dim, | |||
| bias = True | |||
| ): | |||
| super().__init__() | |||
| assert isinstance(original_module, GPT2MLP) | |||
| self.original_module = original_module | |||
| self.adapter = AdapterLayer(emb_dim, bottleneck_size, bias=bias) | |||
| def forward(self, x: torch.Tensor): | |||
| output = self.original_module(x) | |||
| output = self.adapter(output) | |||
| return output | |||
| def mutate_model_recursive_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||
| for name, module in model.named_children(): | |||
| if isinstance(module, GPT2MLP): | |||
| feed_forward_with_adapter = FeedForwardAdapterWrapper(module, bottleneck_size, emb_dim, bias) | |||
| setattr(model, name, feed_forward_with_adapter) | |||
| else: | |||
| mutate_model_recursive_adapter(module, bottleneck_size, emb_dim, bias) # recursively call the function on the module | |||
| def mutate_model_adapter(model: nn.Module, bottleneck_size: int, emb_dim, bias=True): | |||
| if hasattr(model, '_mutated'): | |||
| print("Model already contains adapter layers! \n Try reloading the model.") | |||
| return | |||
| mutate_model_recursive_adapter(model, bottleneck_size, emb_dim, bias) | |||
| model._mutated = True | |||
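| # Example (sketch): injecting bottleneck adapters into a small randomly initialised GPT-2, | |||
| # mirroring the 'adapter' branch of prepare_model above; the tiny config sizes are illustrative only. | |||
| if __name__ == "__main__": | |||
| demo_config = GPT2Config(n_layer=2, n_embd=64, n_head=4) | |||
| demo_model = GPT2LMHeadModel(demo_config) # random init, no pretrained weights needed | |||
| demo_model.requires_grad_(False) | |||
| mutate_model_adapter(demo_model, bottleneck_size=demo_config.n_embd // 16, emb_dim=demo_config.n_embd) | |||
| for name, param in demo_model.named_parameters(): | |||
| if 'adapter' in name: | |||
| param.requires_grad = True | |||
| print(get_number_of_trainable_parameters(demo_model), "trainable adapter parameters") | |||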
| @@ -0,0 +1,211 @@ | |||
| from transformers import get_linear_schedule_with_warmup | |||
| import logging | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from torch.optim import AdamW | |||
| from tqdm import tqdm | |||
| from opacus import PrivacyEngine | |||
| from opacus.utils.batch_memory_manager import BatchMemoryManager | |||
| import wandb | |||
| import math | |||
| from model import save_model | |||
| from torch.optim.lr_scheduler import StepLR | |||
| class Trainer: | |||
| def __init__(self, cfg, model, train_loader, checkpoint=None, second_trainer=False): | |||
| if second_trainer: | |||
| self.epochs = cfg.epochs_two | |||
| self.lr = cfg.lr_two | |||
| self.weight_decay = cfg.weight_decay_two | |||
| else: | |||
| self.epochs = cfg.epochs | |||
| self.lr = cfg.lr | |||
| self.weight_decay = cfg.weight_decay | |||
| self.optimizer = AdamW(model.parameters(), lr=self.lr, weight_decay=self.weight_decay, eps=cfg.optimizer_eps) | |||
| self.gradient_accumulation_steps = cfg.virtual_batch_size // cfg.batch_size | |||
| total_steps = len(train_loader) * self.gradient_accumulation_steps * self.epochs | |||
| if cfg.scheduler: | |||
| if cfg.scheduler_type == "linear": | |||
| warmup_steps = cfg.scheduler_warmup_steps if cfg.scheduler_warmup_steps else int(cfg.scheduler_warmup_ratio * total_steps) | |||
| self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps) | |||
| elif cfg.scheduler_type == "steplr": | |||
| self.scheduler = StepLR(self.optimizer, step_size=cfg.scheduler_step_size, gamma=cfg.scheduler_gamma) | |||
| self.dp = cfg.dp | |||
| self.model = model | |||
| self.cfg = cfg | |||
| self.save_path = f"{cfg.media_path}generation_saved_models/{cfg.dataset}/{cfg.peft_mode}" | |||
| self.model_name = self.cfg.run_name if self.cfg.run_name else "best_model" | |||
| if cfg.dp: | |||
| self.model.train() | |||
| self.privacy_engine = PrivacyEngine( | |||
| accountant="rdp", | |||
| ) | |||
| if checkpoint: | |||
| self.privacy_engine.load_checkpoint(path=checkpoint, module=self.model) | |||
| self.model, self.optimizer, _ = self.privacy_engine.make_private_with_epsilon( | |||
| module=self.model, | |||
| optimizer=self.optimizer, | |||
| data_loader=train_loader, | |||
| target_epsilon=cfg.epsilon, | |||
| target_delta=cfg.delta, | |||
| epochs=self.epochs, | |||
| max_grad_norm=cfg.clipping_threshold, | |||
| ) | |||
| def train_step(self, train_loader): | |||
| train_loss = 0 | |||
| self.model.train() | |||
| self.optimizer.zero_grad() | |||
| if self.dp: | |||
| with BatchMemoryManager(data_loader=train_loader, max_physical_batch_size=self.cfg.batch_size, optimizer=self.optimizer) as new_data_loader: | |||
| for batch_number, batch in tqdm(enumerate(new_data_loader, 1), total=len(new_data_loader)): | |||
| # Move batch tensors to the same device as the model | |||
| batch = prepare_inputs(batch) | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| # Forward pass | |||
| outputs = self.model(**batch) | |||
| loss = outputs.loss | |||
| loss.backward() | |||
| train_loss += loss.item() | |||
| self.optimizer.step() | |||
| self.optimizer.zero_grad() | |||
| if self.cfg.scheduler and self.cfg.scheduler_type in ("linear", "steplr"): | |||
| self.scheduler.step() | |||
| else: | |||
| for batch_number, batch in tqdm(enumerate(train_loader, 1), total=len(train_loader)): | |||
| # Move batch tensors to the same device as the model | |||
| batch = prepare_inputs(batch) | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| # Forward pass | |||
| outputs = self.model(**batch) | |||
| loss = outputs.loss | |||
| loss.backward() | |||
| train_loss += loss.item() | |||
| self.optimizer.step() | |||
| self.optimizer.zero_grad() | |||
| if self.cfg.scheduler and self.cfg.scheduler_type in ("linear", "steplr"): | |||
| self.scheduler.step() | |||
| return train_loss/len(train_loader) | |||
| def evaluate_step(self, val_loader): | |||
| # Evaluation loop | |||
| val_loss = 0 | |||
| self.model.eval() | |||
| with torch.no_grad(): | |||
| for batch in tqdm(val_loader): | |||
| # Move batch tensors to the same device as the model | |||
| batch = prepare_inputs(batch) | |||
| batch = {k: v.to(self.cfg.device) for k, v in batch.items()} | |||
| outputs = self.model(**batch) | |||
| loss = compute_loss_per_input(outputs, batch) | |||
| val_loss += loss.mean().item() | |||
| return val_loss/len(val_loader) | |||
| def train_and_evaluate(self, epochs, train_loader, val_loader): | |||
| best_validation_loss = None | |||
| best_epoch = 0 | |||
| wandb_log = [] | |||
| for epoch in range(epochs): | |||
| log_data = {} | |||
| train_loss = self.train_step(train_loader) | |||
| log_data["train_loss"] = train_loss | |||
| logging.info(f"Epoch {epoch+1} Training loss: {train_loss}") | |||
| val_loss = self.evaluate_step(val_loader=val_loader) | |||
| log_data["validation_loss"] = val_loss | |||
| logging.info(f"Epoch {epoch+1} Validation loss: {val_loss}") | |||
| if best_validation_loss is None or val_loss < best_validation_loss: | |||
| best_validation_loss = val_loss | |||
| best_epoch = epoch | |||
| save_model(self.model, self.cfg.peft_mode, self.save_path, self.model_name) | |||
| logging.info(f"Model improved and saved for epoch {epoch+1}") | |||
| wandb_log.append(log_data) | |||
| logging.info("Best results:") | |||
| if self.cfg.dp: | |||
| logging.info(self.privacy_engine.accountant.get_epsilon(delta=self.cfg.delta)) | |||
| logging.info(f"Best validatin loss: {best_validation_loss} for Epoch: {best_epoch+1}") | |||
| if self.cfg.use_wandb: | |||
| for i, epoch_data in enumerate(wandb_log): | |||
| wandb.log(epoch_data) | |||
| def prepare_inputs(batch): | |||
| batch.pop('src_attn', None) | |||
| batch.pop('tgt_attn', None) | |||
| batch.pop('src', None) | |||
| return batch | |||
| def compute_loss_per_input(outputs, batch): | |||
| # Per-example loss: shift logits and labels for next-token prediction, then average the | |||
| # token-level cross-entropy over the non-ignored (label != -100) positions of each sequence. | |||
| logits = outputs.logits | |||
| shift_logits = logits[..., :-1, :].contiguous() | |||
| shift_labels = batch["labels"][..., 1:].contiguous() | |||
| seq_lens = (shift_labels != -100).sum(dim=1) | |||
| loss = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none") | |||
| loss = loss.sum(dim=1) / seq_lens | |||
| return loss | |||
| def save_evaluation_output(outputs, path): | |||
| with open(path, "w") as file: | |||
| for strings in outputs: | |||
| for string in strings: | |||
| file.write(string + "\n") | |||
| def generate_evaluation_output(model, tokenizer, data, device, max_length, beam_size=5, do_sample=False, num_return_sequences=1): | |||
| generated_texts = [] | |||
| prev = None | |||
| for entry in tqdm(data): | |||
| if prev != entry["meaning_representation"]: | |||
| prev = entry["meaning_representation"] | |||
| prompt = f"{entry['meaning_representation']} {tokenizer.eos_token}" | |||
| inputs = tokenizer(prompt, return_tensors="pt", truncation=True) | |||
| inputs = {key: val.to(device) for key, val in inputs.items()} | |||
| with torch.no_grad(): | |||
| outputs = model.generate(**inputs, | |||
| num_beams=beam_size, | |||
| max_length=max_length, | |||
| do_sample=do_sample, | |||
| early_stopping=True, | |||
| min_length=5, | |||
| num_return_sequences=num_return_sequences, | |||
| bad_words_ids = [[628], [198], [tokenizer.pad_token_id]], | |||
| pad_token_id=tokenizer.eos_token_id, | |||
| repetition_penalty=1, | |||
| top_k=0, | |||
| top_p=0.9) | |||
| temp_generated_texts = [] | |||
| for output in outputs: | |||
| generated_text = tokenizer.decode(output[len(inputs["input_ids"][0]):], skip_special_tokens=True) | |||
| temp_generated_texts.append(generated_text.strip()) | |||
| generated_texts.append(temp_generated_texts) | |||
| return generated_texts | |||
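| # --- Hedged usage sketch (not part of the original file) --- | |||
| # One way this module might be driven end to end. `Config` is the argument parser class | |||
| # from this repository; `load_model`, `load_tokenizer`, `get_dataloaders`, and `val_data` | |||
| # are assumed helper names and may differ in the actual code base. | |||
| # cfg = Config().args | |||
| # model = load_model(cfg).to(cfg.device) | |||
| # train_loader, val_loader = get_dataloaders(cfg) | |||
| # trainer = Trainer(cfg, model, train_loader) | |||
| # trainer.train_and_evaluate(trainer.epochs, train_loader, val_loader) | |||
| # outputs = generate_evaluation_output(model, load_tokenizer(cfg), val_data, cfg.device, max_length=cfg.max_length) | |||
| # save_evaluation_output(outputs, f"{cfg.media_path}eval_outputs.txt") | |||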
| @@ -0,0 +1,42 @@ | |||
| import torch | |||
| def clean_hyperparameters(hyperparameters: dict): | |||
| if hyperparameters["scheduler"] == 0: | |||
| hyperparameters.pop("scheduler_type", None) | |||
| hyperparameters.pop("scheduler_warmup_ratio", None) | |||
| hyperparameters.pop("scheduler_warmup_steps", None) | |||
| hyperparameters.pop("scheduler_step_size", None) | |||
| hyperparameters.pop("scheduler_gamma", None) | |||
| if hyperparameters["peft_mode"] != "lora": | |||
| hyperparameters.pop("rank", None) | |||
| hyperparameters.pop("alpha", None) | |||
| hyperparameters.pop("drop_out", None) | |||
| if hyperparameters["peft_mode"] != "adapter" and hyperparameters["peft_mode"] != "adapterbitfit": | |||
| hyperparameters.pop("reduction_factor", None) | |||
| if hyperparameters["dp"] == 0: | |||
| hyperparameters.pop("epsilon", None) | |||
| hyperparameters.pop("delta", None) | |||
| hyperparameters.pop("clipping_mode", None) | |||
| hyperparameters.pop("clipping_threshold", None) | |||
| if hyperparameters["use_wandb"] == 0: | |||
| hyperparameters.pop("wandb_project_name", None) | |||
| hyperparameters.pop("use_wandb", None) | |||
| if hyperparameters["two_step_training"] == 0: | |||
| hyperparameters.pop("lr_two", None) | |||
| hyperparameters.pop("virtual_batch_size_two", None) | |||
| hyperparameters.pop("epochs_two", None) | |||
| hyperparameters.pop("weight_decay_two", None) | |||
| hyperparameters.pop("f", None) | |||
| hyperparameters.pop("media_path", None) | |||
| hyperparameters.pop("model_cache_path", None) | |||
| return hyperparameters | |||
| def copy_model_weights(model1, model2): | |||
| model1.eval() | |||
| model2.eval() | |||
| params1 = model1.parameters() | |||
| params2 = model2.parameters() | |||
| with torch.no_grad(): | |||
| for param1, param2 in zip(params1, params2): | |||
| param2.data.copy_(param1.data) | |||
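| # --- Hedged usage sketch (not part of the original file) --- | |||
| # clean_hyperparameters is convenient for trimming the argparse namespace before logging | |||
| # it, e.g. as the wandb run config. `cfg = Config().args` is assumed; wandb.init's | |||
| # project/name/config keyword arguments are part of the real wandb API. | |||
| # hparams = clean_hyperparameters(vars(cfg).copy()) | |||
| # wandb.init(project=cfg.wandb_project_name, name=cfg.run_name, config=hparams) | |||
| # copy_model_weights copies the parameters of model1 into model2; the two models are | |||
| # assumed to share the same architecture so that parameters line up one-to-one: | |||
| # copy_model_weights(trained_model, fresh_model) | |||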
| @@ -0,0 +1,3 @@ | |||
| # Privacy-Preserving Fine-tuning of Parameter-Efficient Language Models | |||
| Details coming soon! | |||