# intro

In [1]:
from typing import Optional

import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from transformers import T5TokenizerFast, T5ForConditionalGeneration

from _utils import print_system_info, generate_dataloader
from _datasets import AutoLoad
from _mydelta import T5Wrapper, auto_freeze
from _trainer import train_loop, valid_loop

print_system_info()

Python version is: 3.10.11
Scikit-learn version is: 1.2.2
Torch version is: 1.13.1+cu117
Nvidia device is: NVIDIA GeForce RTX 4090
Transformers version is: 4.32.1
Adapterhub not found!!!


In [2]:
from types import SimpleNamespace
config = SimpleNamespace(
    model_name='google/t5-base-lm-adapt',
    n_tokens=30,
    n_layers=6,
    random_seed=42,
    task=['glue:cola'],
    hot_modules=['sadcl'],
    train_batch_size=32,
    valid_batch_size=32,
    balancify_sample=False,
    learning_rate=0.01,
    num_epochs=200
)

In [3]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

np.random.seed(config.random_seed)
slected_tokens = torch.from_numpy(np.random.randint(0, 32128, size=(config.n_tokens,)))

# load model and date

In [4]:
model = T5ForConditionalGeneration.from_pretrained(config.model_name)
tokenizer = T5TokenizerFast.from_pretrained(config.model_name, model_max_length=2048)

In [5]:
delta_module = T5Wrapper.mutate(
    model=model,
    config=config,
    slected_tokens=slected_tokens
)
auto_freeze(model, config.hot_modules, verbose=True)

encoder.block.6.soft_prompt.sadcl_learned_embedding
encoder.block.7.soft_prompt.sadcl_learned_embedding
encoder.block.8.soft_prompt.sadcl_learned_embedding
encoder.block.9.soft_prompt.sadcl_learned_embedding
encoder.block.10.soft_prompt.sadcl_learned_embedding
encoder.block.11.soft_prompt.sadcl_learned_embedding


In [15]:
for x, y in model.named_parameters():
    print(x)

shared.weight
encoder.block.0.layer.0.SelfAttention.q.weight
encoder.block.0.layer.0.SelfAttention.k.weight
encoder.block.0.layer.0.SelfAttention.v.weight
encoder.block.0.layer.0.SelfAttention.o.weight
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight
encoder.block.0.layer.0.layer_norm.weight
encoder.block.0.layer.1.DenseReluDense.wi_0.weight
encoder.block.0.layer.1.DenseReluDense.wi_1.weight
encoder.block.0.layer.1.DenseReluDense.wo.weight
encoder.block.0.layer.1.layer_norm.weight
encoder.block.1.layer.0.SelfAttention.q.weight
encoder.block.1.layer.0.SelfAttention.k.weight
encoder.block.1.layer.0.SelfAttention.v.weight
encoder.block.1.layer.0.SelfAttention.o.weight
encoder.block.1.layer.0.layer_norm.weight
encoder.block.1.layer.1.DenseReluDense.wi_0.weight
encoder.block.1.layer.1.DenseReluDense.wi_1.weight
encoder.block.1.layer.1.DenseReluDense.wo.weight
encoder.block.1.layer.1.layer_norm.weight
encoder.block.2.layer.0.SelfAttention.q.weight
encoder.block.2.layer.0.

In [6]:
data_loader = AutoLoad(tokenizer)
dataset = data_loader.get_and_map(config.task[0])
train_loader, valid_loader = generate_dataloader(tokenizer, dataset['train'], dataset['valid'], config)

Found cached dataset glue (/home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-63df8ebe4567b55a.arrow
Loading cached processed dataset at /home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-bb3872c77bcda3cd.arrow


In [7]:
# model(**next(iter(train_loader))).loss.backward()
# for i in range(6, 12):
#     o = model.encoder.block[i].soft_prompt.sadcl_learned_embedding.grad.abs().sum().item()
#     print(i, o)

# train model

In [10]:
import wandb
wandb.init(
    # set the wandb project where this run will be logged
    project="my-awesome-project",
    # track hyperparameters and run metadata
    config=config.__dict__
)


KeyboardInterrupt



In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

model.to(DEVICE)

for epoch in range(config.num_epochs):
    train_out = train_loop(model=model, loader=train_loader, optimizer=optimizer)
    valid_out = valid_loop(model=model, loader=valid_loader)
    wandb.log({
        **train_out,
        **valid_out
    })
    
wandb.finish()

  0%|                                                                                                        | 0/268 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:08<00:00, 32.56it/s]


{'train_loss': 8.963883996009827, 'valid_loss': 6.972279635342685, 'valid_accuracy': 0.0, 'valid_f1-score-1': 0.0, 'valid_f1-score-ma': 0.0}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 34.00it/s]


{'train_loss': 7.36324492141382, 'valid_loss': 5.521347826177424, 'valid_accuracy': 0.0, 'valid_f1-score-1': 0.0, 'valid_f1-score-ma': 0.0}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.97it/s]


{'train_loss': 6.192735992260833, 'valid_loss': 4.384567484711155, 'valid_accuracy': 0.0, 'valid_f1-score-1': 0.0, 'valid_f1-score-ma': 0.0}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.89it/s]


{'train_loss': 5.118913385405469, 'valid_loss': 3.335551644816543, 'valid_accuracy': 0.05465004793863854, 'valid_f1-score-1': 0.14091470951792337, 'valid_f1-score-ma': 0.009394313967861558}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.93it/s]


{'train_loss': 4.148920764674002, 'valid_loss': 2.2682720783985024, 'valid_accuracy': 0.174496644295302, 'valid_f1-score-1': 0.36804853387259856, 'valid_f1-score-ma': 0.02164991375721168}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.94it/s]


{'train_loss': 3.1643025679374808, 'valid_loss': 1.2784492048350247, 'valid_accuracy': 0.5148609779482263, 'valid_f1-score-1': 0.7208053691275169, 'valid_f1-score-ma': 0.05544656685596284}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.87it/s]


{'train_loss': 2.235221519843856, 'valid_loss': 0.6245457141688375, 'valid_accuracy': 0.6625119846596357, 'valid_f1-score-1': 0.7915151515151515, 'valid_f1-score-ma': 0.09733333333333333}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.91it/s]


{'train_loss': 1.5051592252592543, 'valid_loss': 0.4341738431742697, 'valid_accuracy': 0.6768935762224353, 'valid_f1-score-1': 0.8077147866744595, 'valid_f1-score-ma': 0.0810680363745307}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.99it/s]


{'train_loss': 1.0515665002723238, 'valid_loss': 0.3996329452052261, 'valid_accuracy': 0.6826462128475551, 'valid_f1-score-1': 0.8151116199198626, 'valid_f1-score-ma': 0.08151116199198626}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 34.04it/s]


{'train_loss': 0.8272433252032123, 'valid_loss': 0.3832718174565922, 'valid_accuracy': 0.6855225311601151, 'valid_f1-score-1': 0.8162100456621004, 'valid_f1-score-ma': 0.11660143509458577}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 34.00it/s]


{'train_loss': 0.7063313028705653, 'valid_loss': 0.36713372216080176, 'valid_accuracy': 0.6874400767018217, 'valid_f1-score-1': 0.8175598631698974, 'valid_f1-score-ma': 0.16351197263397949}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 33.92it/s]


{'train_loss': 0.6392747724234168, 'valid_loss': 0.36563855906327564, 'valid_accuracy': 0.6874400767018217, 'valid_f1-score-1': 0.8170940170940172, 'valid_f1-score-ma': 0.2042735042735043}


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:07<00:00, 34.00it/s]


{'train_loss': 0.5930970842713741, 'valid_loss': 0.3603471038919507, 'valid_accuracy': 0.6874400767018217, 'valid_f1-score-1': 0.8170940170940172, 'valid_f1-score-ma': 0.2042735042735043}


 40%|█████████████████████████████████████▏                                                        | 106/268 [00:03<00:04, 33.89it/s]

KeyboardInterrupt



In [None]:
# pip uninstall bitsandbytes -y

In [None]:
dataset['train'].set_format(columns=['label', 'labels'])

In [None]:
dataset['train'][0:100]

In [None]:
from datasets import load_dataset
x = load_dataset("glue", "sst2")

In [None]:
Counter(x['train']['label'])

In [None]:
g = x['train']
l = g.features['label']

In [None]:
l.int2str(1)

In [None]:
29780 / (29780 + 37569)

In [11]:
from types import SimpleNamespace
config = SimpleNamespace(
    model_name='google/t5-base-lm-adapt',
    peft_params={
        'n_tokens': 30,
        'n_layers': 6
    },
    random_seed=42,
    task=['glue:cola'],
    hot_modules=['sadcl'],
    train_batch_size=32,
    valid_batch_size=32,
    balancify_sample=False,
    learning_rate=0.01,
    num_epochs=50
)

In [13]:
import json
print(json.dumps(config.__dict__))

{"model_name": "google/t5-base-lm-adapt", "peft_params": {"n_tokens": 30, "n_layers": 6}, "random_seed": 42, "task": ["glue:cola"], "hot_modules": ["sadcl"], "train_batch_size": 32, "valid_batch_size": 32, "balancify_sample": false, "learning_rate": 0.01, "num_epochs": 50}


In [1]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration


In [2]:
tokenizer = T5TokenizerFast.from_pretrained("google/t5-large-lm-adapt", model_max_length=2048)


In [3]:
import numpy as np

In [4]:
np.random.randint(0, tokenizer.vocab_size, size=(100,))

array([23830,  2611, 19567, 10149, 20142,  6737, 26963,  6788,  3871,
       28330,   724,  7406, 11474, 18399,  2289, 25511, 25299, 23308,
       25412,   370, 32091, 28829,  6148, 29154, 30369, 12979,  8560,
        6872, 23228,  8051, 19537,  3741, 22206, 20744, 17051, 27857,
        3830, 15329, 21857,  8296, 10768,  7854,  5710,  5405, 27449,
       11528,  8599, 12695, 15427, 23726,   389,  3231, 15270, 26906,
       23085, 15113, 31792,  8766,  9814, 15904,  6320, 23716, 19682,
        2690, 30766, 21262, 11415,  2523, 26538,  3647, 13971, 21655,
         287, 19479, 28945, 25134, 17673,  9792, 17556, 31293, 25795,
        2753,  8955, 21049, 28409, 24281,  3610, 26070,  2189, 25611,
        9641, 23766, 29195,   779, 18660, 10731, 19732,  1664,  2176,
        2254])

In [1]:
import torch

In [2]:
w = torch.load('best.pt')

In [8]:
w.pop('sadcl_learned_embedding')

tensor([[ 0.5470, -0.8095, -1.4617,  ...,  0.8100, -1.1746,  0.5768],
        [-0.9284, -0.6230, -2.4697,  ...,  0.3947, -0.5427, -0.3088],
        [ 1.4407,  0.8760,  0.2499,  ...,  0.1860, -0.3176,  2.0041],
        ...,
        [ 0.8714,  1.1013, -2.7711,  ..., -0.2819,  0.7087, -0.6164],
        [ 0.8026, -0.7928, -0.8946,  ..., -1.5204,  1.0164, -1.3527],
        [ 0.4650, -2.1778,  0.0213,  ..., -1.1430, -2.3895, -0.0235]],
       device='cuda:0')

In [9]:
w

OrderedDict([('sadcl_mlp.0.weight',
              tensor([[ 0.1171, -0.7743,  0.5095,  ..., -1.0615,  1.5754,  0.7036],
                      [-0.2675,  0.0969,  0.0543,  ...,  0.7276, -0.0671,  0.8296],
                      [-0.2987, -0.0700, -1.0519,  ...,  0.6090,  0.0193,  0.0410],
                      ...,
                      [-0.1463, -0.8924,  0.7947,  ...,  0.2265, -0.6957,  0.5928],
                      [-0.4365, -0.9251, -1.0378,  ..., -0.8628, -0.5243,  0.0860],
                      [ 0.4860,  0.0648, -0.9160,  ..., -0.5342,  0.1072, -0.1397]],
                     device='cuda:0')),
             ('sadcl_mlp.0.bias',
              tensor([-0.6311, -1.0433, -1.0390, -1.6997, -1.0766, -0.2802, -0.9433, -0.7127,
                       0.5315, -1.0400, -0.3756, -0.2602, -0.7607,  0.7578, -0.7066, -0.3561,
                      -0.5580, -0.7671, -0.2557, -1.6528, -0.1438, -0.4875, -0.6291, -1.2763,
                      -0.2484, -0.6396, -0.7225, -0.8314, -1.3913, -0.7696, 