# Intro

In [1]:
from transformers import GPT2TokenizerFast, GPT2Model, DataCollatorWithPadding
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
import torch
import torch.nn as nn
from utils import print_system_info
from typing import Literal, Optional, List, Dict, Callable
from types import SimpleNamespace
from dataclasses import dataclass

print_system_info()

Python version is: 3.10.11
Torch version is: 1.13.1+cu117
Nvidia device is: NVIDIA GeForce RTX 4090
Transformers version is: 4.32.1
Adapterhub not found!!!


In [2]:
# Device every model and tensor in this notebook is moved to:
# the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint name used as the default for both the tokenizer and the backbone.
MODEL_NAME = 'gpt2'
# Substring that marks a parameter as trainable (see GPT2LLL.auto_freeze).
NAMESPACE = 'sadcl'

# Text that is run through the backbone once; its per-layer hidden states
# are used to initialise the soft prompts.
INIT_TEXT = "sentiment or value or relation of the previous text is"
# Number of transformer blocks, counted from the end, that get a soft prompt.
N_LAST_LAYERS = 10

# Length of each soft prompt, and number of pad tokens appended to every input row.
N_TOKENS = 5

# Class

In [3]:
def initialize_embedding(
    emb_dim: int,
    n_tokens: int,
    random_range: float,
    initialize_from: Optional[torch.Tensor]
):
    """Build the initial value of a soft prompt.

    Args:
        emb_dim: Width of each prompt vector.
        n_tokens: Number of prompt vectors.
        random_range: Half-width of the uniform interval used when no source
            tensor is supplied.
        initialize_from: Optional 2-D tensor to copy the first ``n_tokens``
            rows from. It must have at least ``n_tokens`` rows and exactly
            ``emb_dim`` columns.

    Returns:
        A ``(n_tokens, emb_dim)`` tensor: either a detached copy of the leading
        rows of ``initialize_from`` or a fresh float32 CPU tensor sampled
        uniformly from ``[-random_range, random_range]``.

    Raises:
        AssertionError: If ``initialize_from`` has too few rows or the wrong width.
    """
    if initialize_from is not None:
        # Shape checks run in the same order as the slicing below depends on them.
        assert initialize_from.size(0) >= n_tokens
        assert initialize_from.size(1) == emb_dim
        leading_rows = initialize_from[:n_tokens, :]
        # Detach and clone so the prompt neither shares storage nor history with the source.
        return leading_rows.detach().clone()

    random_init = torch.empty((n_tokens, emb_dim), dtype=torch.float32, device="cpu")
    return random_init.uniform_(-random_range, random_range)

class SoftEmbedding(nn.Module):
    """Trainable soft prompt that is placed at the front of a hidden-state sequence.

    The prompt is stored in ``sadcl_learned_embedding`` with shape
    ``(n_tokens, emb_dim)``. The sequence length is never changed:

    * with ``first_layer_flag=True`` the last ``n_tokens`` positions of the
      input are dropped and the prompt is prepended, shifting the remaining
      positions right by ``n_tokens``;
    * otherwise the first ``n_tokens`` positions of the input are overwritten
      by this layer's prompt.

    Args:
        emb_dim: Width of each prompt vector.
        n_tokens: Number of prompt vectors.
        first_layer_flag: Whether this is the first prompted layer (see above).
        random_range: Half-width of the uniform init used when
            ``initialize_from`` is ``None``.
        initialize_from: Optional tensor whose leading rows seed the prompt.
    """

    def __init__(
        self,
        emb_dim: int,
        n_tokens: int,
        first_layer_flag: bool = False,
        random_range: float = 0.1,
        initialize_from: Optional[torch.Tensor] = None
    ):
        super().__init__()

        self.emb_dim = emb_dim
        self.n_tokens = n_tokens
        self.first_layer_flag = first_layer_flag

        self.sadcl_learned_embedding = nn.parameter.Parameter(
            initialize_embedding(
                emb_dim,
                n_tokens,
                random_range,
                initialize_from
            )
        )

        assert self.sadcl_learned_embedding.shape == (n_tokens, emb_dim)

    def forward(self, input_embedding, attention_mask, sequnce_lengths):
        """Insert the prompt into ``input_embedding`` and shift the mask to match.

        Args:
            input_embedding: Tensor indexed as ``(batch, sequence, emb_dim)``.
            attention_mask: ``None``, or a 4-D tensor whose last axis is the
                sequence axis. Presumably the additive mask GPT-2 hands to its
                blocks, where 0 means "attend" — confirm against the caller.
            sequnce_lengths: Accepted for interface compatibility; not used.

        Returns:
            ``(output_embedding, attention_mask)``. The mask is returned shifted
            right by ``n_tokens`` with zeros filled in at the front, or ``None``
            when no mask was given.
        """
        batch_size = input_embedding.size(0)
        # (n_tokens, emb_dim) -> (batch_size, n_tokens, emb_dim)
        learned_embedding = self.sadcl_learned_embedding.repeat(batch_size, 1, 1)

        if self.first_layer_flag:
            # First prompted layer: drop the tail to make room for the prompt.
            kept_states = input_embedding[:, :-self.n_tokens]
        else:
            # Later layers: the prompt slots already exist, replace their content.
            kept_states = input_embedding[:, self.n_tokens:]
        output_embedding = torch.cat([learned_embedding, kept_states], dim=1)

        # The model may be called without a mask; there is nothing to shift then.
        if attention_mask is not None:
            attention_mask_shift = torch.zeros(
                (batch_size, 1, 1, self.n_tokens),
                dtype=attention_mask.dtype,  # keep the mask dtype (e.g. half precision) intact
                device=attention_mask.device
            )
            attention_mask = torch.cat(
                [attention_mask_shift, attention_mask[:, :, :, :-self.n_tokens]],
                dim=-1
            )
        return output_embedding, attention_mask

    def get_weights(self):
        """Return a detached copy of the current prompt tensor."""
        return self.sadcl_learned_embedding.detach().clone()


class GPT2ModuleWrapper(nn.Module):
    """Wrap a transformer block so a soft prompt is injected before it runs.

    Args:
        module: The block being wrapped; kept as ``original_module``.
        emb_dim: Hidden width, forwarded to ``SoftEmbedding``.
        n_tokens: Soft-prompt length, forwarded to ``SoftEmbedding``.
        get_sequnce_lengths: Zero-argument callable returning the per-row
            sequence lengths recorded for the current batch.
        first_layer_flag: Whether this is the first wrapped block.
        initialize_from: Optional tensor used to seed the soft prompt.
    """

    def __init__(
        self,
        module,
        emb_dim: int,
        n_tokens: int,
        get_sequnce_lengths: Callable[[], List[int]],
        first_layer_flag: bool,
        initialize_from: Optional[torch.Tensor] = None
    ):
        super().__init__()
        self.original_module = module
        self.soft_prompt = SoftEmbedding(
            emb_dim=emb_dim,
            n_tokens=n_tokens,
            first_layer_flag=first_layer_flag,
            initialize_from=initialize_from
        )
        self.get_sequnce_lengths = get_sequnce_lengths

    def forward(self, hidden_states, *args, **kwargs):
        """Apply the soft prompt, then delegate to the wrapped block.

        ``attention_mask`` must be supplied as a keyword argument; it is
        replaced by the mask returned from the soft prompt. All other
        positional and keyword arguments are passed through untouched.

        Raises:
            KeyError: If ``attention_mask`` is not among the keyword arguments.
        """
        output_embedding, attention_mask = self.soft_prompt(
            hidden_states,
            kwargs['attention_mask'],
            self.get_sequnce_lengths()
        )
        kwargs['attention_mask'] = attention_mask
        return self.original_module(output_embedding, *args, **kwargs)

class GPT2Injector:
    """Installs and removes soft-prompt wrappers on a GPT-2 style model.

    The injector also records, for every forward call of the patched model,
    the per-row sequence lengths derived from ``input_ids`` so that wrappers
    and heads can read them through ``get_sequnce_lengths``.
    """

    def __init__(self):
        # Filled in by the patched forward; None until the model is called once.
        self.sequnce_lengths = None

    def get_sequnce_lengths(self):
        """Return the lengths recorded by the most recent forward call."""
        return self.sequnce_lengths

    def _mutate_model_forward(self, model):
        """Replace ``model.forward`` with a version that records sequence lengths.

        For each row the recorded value is the index of the first pad token
        minus one. Rows without any pad token, or starting with one, are
        expected to yield ``-1`` (argmax over an all-zero row, or a hit at
        index 0).
        """
        old_forward = model.forward
        pad_token_id = model.config.pad_token_id

        def new_forward(*args, **kwargs):
            # Accept input_ids by keyword or as the first positional argument.
            if 'input_ids' in kwargs:
                input_ids = kwargs['input_ids']
            elif args:
                input_ids = args[0]
            else:
                raise KeyError('input_ids')
            self.sequnce_lengths = (
                torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
            ).detach().cpu().tolist()
            return old_forward(*args, **kwargs)

        model.forward = new_forward

    def _reverse_mutate_model_forward(self, model):
        """Rebind the class-level ``forward`` onto ``model``, dropping the patch."""
        orig_class = type(model)
        model.forward = orig_class.forward.__get__(model, orig_class)

    def mutate(self, model, n_layers, n_tokens, init_prompts):
        """Wrap the last ``n_layers`` blocks of ``model.h`` with soft prompts.

        Args:
            model: Model exposing ``h`` (the block list), ``embed_dim``,
                ``config.pad_token_id`` and ``forward``.
            n_layers: Number of trailing blocks to wrap.
            n_tokens: Soft-prompt length for every wrapped block.
            init_prompts: Indexable by block index; ``init_prompts[idx][0]``
                seeds the prompt of block ``idx``.

        Returns:
            The slice of the block list containing the newly created wrappers.
        """
        self._mutate_model_forward(model)
        # Operate on the model that was passed in, not on a notebook global.
        module_list = model.h
        start = len(module_list) - n_layers
        for idx in range(start, len(module_list)):
            module_list[idx] = GPT2ModuleWrapper(
                module=module_list[idx],
                emb_dim=model.embed_dim,
                n_tokens=n_tokens,
                get_sequnce_lengths=self.get_sequnce_lengths,
                first_layer_flag=(idx == start),
                initialize_from=init_prompts[idx][0]
            )
        return module_list[start:]

    def reverse_mutate(self, model):
        """Undo ``mutate``: restore ``forward`` and unwrap every wrapped block."""
        self._reverse_mutate_model_forward(model)
        module_list = model.h
        for idx in range(len(module_list)):
            if type(module_list[idx]) is GPT2ModuleWrapper:
                module_list[idx] = module_list[idx].original_module


In [4]:
class MixHeadModel(nn.Module):
    """Chain a backbone and a task head behind a single ``forward``.

    Args:
        model: Callable backbone; receives every argument except ``labels``.
        head: Callable taking ``transformer_outputs`` and ``labels`` keywords.
    """

    def __init__(self, model, head):
        super().__init__()
        self.model = model
        self.sadcl_head = head

    def forward(self, *args, **kwargs):
        """Run the backbone, then the head.

        ``labels`` is removed from the keyword arguments (defaulting to
        ``None``) so the backbone never sees it, and is handed to the head
        together with the backbone output. The head's result is returned as is.
        """
        labels = kwargs.pop('labels') if 'labels' in kwargs else None
        backbone_output = self.model(*args, **kwargs)
        return self.sadcl_head(transformer_outputs=backbone_output, labels=labels)

In [5]:
class GPT2ClassificationHead(nn.Module):
    """Linear classification head that reads a single position per sequence.

    Args:
        emb_dim: Width of the incoming hidden states.
        n_labels: Number of output classes.
        get_sequnce_lengths: Zero-argument callable returning one position
            index per batch row.
        n_tokens: Offset added to every position index before it is used.
        init_range: Standard deviation of the normal init for the weights.
        bias: Whether the linear layer has a bias term.
    """

    def __init__(
        self,
        emb_dim: int,
        n_labels: int,
        get_sequnce_lengths: Callable[[], List[int]],
        n_tokens: int,
        init_range: float,
        bias=True
    ):
        super().__init__()

        self.get_sequnce_lengths = get_sequnce_lengths
        self.n_labels = n_labels
        self.n_tokens = n_tokens
        self.loss_func = nn.CrossEntropyLoss()

        # Bias is false in huggingface implementation
        self.score = nn.Linear(emb_dim, n_labels, bias)

        self._init_weights(init_range)

    def _init_weights(self, init_range):
        """Draw the weights from N(0, init_range) and zero the bias if present."""
        with torch.no_grad():
            self.score.weight.normal_(mean=0.0, std=init_range)
            if self.score.bias is not None:
                self.score.bias.zero_()

    def forward(self, transformer_outputs, labels=None):
        """Classify each sequence from one hidden state.

        For row ``i`` the hidden state at index
        ``get_sequnce_lengths()[i] + n_tokens`` of ``last_hidden_state`` is
        fed to the linear layer.

        Args:
            transformer_outputs: Object exposing ``last_hidden_state``,
                ``past_key_values``, ``hidden_states`` and ``attentions``.
            labels: Optional class targets; when given, cross-entropy loss is
                computed over the flattened logits and labels.

        Returns:
            A ``SequenceClassifierOutputWithPast`` carrying the loss (or
            ``None``), the logits, and the pass-through backbone fields.
        """
        picked_positions = [
            seqlen + self.n_tokens for seqlen in self.get_sequnce_lengths()
        ]
        hidden = transformer_outputs.last_hidden_state
        row_index = range(hidden.size(0))

        logits = self.score(hidden[row_index, picked_positions])

        if labels is None:
            loss = None
        else:
            loss = self.loss_func(logits.view(-1, self.n_labels), labels.view(-1))

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

In [6]:
@dataclass
class PEFTConfig:
    """Description of one named prompt-tuning setup."""
    # Key under which the setup is registered.
    name: str
    # Task family of the setup.
    kind: Literal['regression', 'classification', 'generation']
    # Number of classes; only for classification.
    n_labels: Optional[int]

    @classmethod
    def classification(cls, name: str, n_labels: int):
        """Build a config whose ``kind`` is ``'classification'``."""
        return cls(name, 'classification', n_labels)

class GPT2LLL:
    """Owns a GPT-2 backbone, its tokenizer and a registry of named PEFT setups.

    Each registered setup consists of a classification head and an injector.
    Activating a setup wraps the last ``n_last_layers`` blocks of the backbone
    with soft prompts and exposes backbone + head as ``current_mix_model``.

    Args:
        n_tokens: Soft-prompt length; also the number of pad tokens appended
            to every tokenized row.
        n_last_layers: Number of trailing blocks that receive a soft prompt.
        model_name: Checkpoint name for both tokenizer and backbone.
        device: Device the backbone and heads are moved to.
        init_text: Text whose per-layer hidden states seed the soft prompts.
    """

    def __init__(
        self,
        n_tokens=N_TOKENS,
        n_last_layers=N_LAST_LAYERS,
        model_name=MODEL_NAME,
        device=DEVICE,
        init_text=INIT_TEXT
    ):
        self.n_tokens = n_tokens
        self.n_last_layers = n_last_layers
        self.model_name = model_name
        self.device = device

        # name -> SimpleNamespace(head=..., injector=...)
        self.pefts = {}

        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_name, add_prefix_space=True)
        # Use the EOS token as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GPT2Model.from_pretrained(model_name, pad_token_id=self.tokenizer.pad_token_id)
        self.model.to(device)

        # Run the init text through the still-unmodified backbone once and keep
        # every layer's hidden states as initial values for the soft prompts.
        init_tokens = self.tokenizer(init_text, return_tensors='pt').to(device)
        with torch.no_grad():
            self.init_prompts = self.model(**init_tokens, output_hidden_states=True).hidden_states

        self.current_peft_name = None
        self.current_mix_model = None

    @property
    def current_peft(self):
        """The active setup's namespace, or ``None`` when nothing is active."""
        if self.current_peft_name is None:
            return None
        return self.pefts[self.current_peft_name]

    def generate_tokenizer_map(self):
        """Return a function that tokenizes rows and reserves room for the prompt.

        The returned function tokenizes its input and then appends
        ``n_tokens`` pad ids to every ``input_ids`` row and ``n_tokens`` zeros
        to every ``attention_mask`` row.
        """
        n_tokens = self.n_tokens
        tokenizer = self.tokenizer

        def return_function(rows):
            outputs_dict = tokenizer(rows)
            for row in outputs_dict['input_ids']:
                row.extend([tokenizer.pad_token_id] * n_tokens)
            for row in outputs_dict['attention_mask']:
                row.extend([0] * n_tokens)
            return outputs_dict
        return return_function

    def activate_peft(self, name):
        """Make the setup registered under ``name`` the active one.

        Any previously active setup is removed from the backbone first, so
        repeated activation never stacks wrappers on top of each other. Note
        that the injector creates new soft prompts on every activation.

        Raises:
            KeyError: If ``name`` was never registered; state is left untouched.
        """
        # Look the setup up before changing any state.
        peft = self.pefts[name]

        if self.current_peft_name is not None:
            self.current_peft.injector.reverse_mutate(self.model)

        self.current_peft_name = name
        peft.injector.mutate(
            model=self.model,
            n_layers=self.n_last_layers,
            n_tokens=self.n_tokens,
            init_prompts=self.init_prompts
        )
        self.current_mix_model = MixHeadModel(
            head=peft.head,
            model=self.model
        )

    def auto_freeze(self):
        """Train only parameters whose name contains ``NAMESPACE``; freeze the rest.

        The names of the parameters left trainable are printed.
        """
        print("Unfreezed params are:")
        for param_name, weights in self.current_mix_model.named_parameters():
            if NAMESPACE in param_name:
                weights.requires_grad = True
                print("- " + param_name)
            else:
                weights.requires_grad = False

    def add_peft(self, config: PEFTConfig):
        """Register a new setup (head + injector) under ``config.name``.

        Raises:
            AssertionError: If the name is already registered.
            NotImplementedError: If ``config.kind`` is not ``'classification'``;
                only a classification head is implemented.
        """
        assert config.name not in self.pefts
        if config.kind != 'classification':
            # Fail loudly instead of silently building a classification head.
            raise NotImplementedError(
                f"PEFT kind {config.kind!r} is not supported; only 'classification' is implemented"
            )
        injector = GPT2Injector()
        head = GPT2ClassificationHead(
            emb_dim=self.model.embed_dim,
            n_labels=config.n_labels,
            get_sequnce_lengths=injector.get_sequnce_lengths,
            n_tokens=self.n_tokens,
            init_range=self.model.config.initializer_range,
            bias=False
        )
        head.to(self.device)
        self.pefts[config.name] = SimpleNamespace(
            head=head,
            injector=injector
        )

# Train

## Prepare

In [7]:
# Name under which the single setup used in this notebook is registered.
peft_name = 'peft1'

# Build tokenizer and backbone with the notebook-level defaults.
manager = GPT2LLL()
# Register a 2-label classification setup ...
manager.add_peft(PEFTConfig.classification(name=peft_name, n_labels=2))
# ... inject its soft prompts into the backbone and build the backbone+head model ...
manager.activate_peft(peft_name)
# ... and leave only parameters whose name contains NAMESPACE trainable.
manager.auto_freeze()

Unfreezed params are:
- model.h.2.soft_prompt.sadcl_learned_embedding
- model.h.3.soft_prompt.sadcl_learned_embedding
- model.h.4.soft_prompt.sadcl_learned_embedding
- model.h.5.soft_prompt.sadcl_learned_embedding
- model.h.6.soft_prompt.sadcl_learned_embedding
- model.h.7.soft_prompt.sadcl_learned_embedding
- model.h.8.soft_prompt.sadcl_learned_embedding
- model.h.9.soft_prompt.sadcl_learned_embedding
- model.h.10.soft_prompt.sadcl_learned_embedding
- model.h.11.soft_prompt.sadcl_learned_embedding
- sadcl_head.score.weight


In [8]:
# NOTE(review): `config` is not referenced by any other visible cell; the training
# hyperparameters below are hard-coded. Confirm whether this cell is still needed.
from config import load_config
config = load_config('config.yaml')

In [9]:
# Load GLUE/CoLA and tokenize its 'sentence' column with the manager's map,
# which also appends the pad ids / zero mask entries reserved for the soft prompt.
from datasets import load_dataset
dataset = load_dataset('glue', 'cola')
tokenizer_map = manager.generate_tokenizer_map()
dataset = dataset.map(lambda x: tokenizer_map(x['sentence']), batched=True)
# Expose only the listed columns, as torch tensors.
# NOTE(review): the column is called 'label' here while the training cell uses
# label_names=['labels'] — presumably the data collator renames it; confirm.
dataset.set_format(type='torch', columns=[
    'input_ids', 'attention_mask', 'label' # 'token_type_ids',
])

Found cached dataset glue (/home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-f7a02c6d65621ecd.arrow
Loading cached processed dataset at /home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c36341ab82d2d37d.arrow
Loading cached processed dataset at /home/mohalisad/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9f7663dac81ea13b.arrow


## Training

In [15]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import classification_report


def compute_metrics(pred):
    """Summarise an evaluation pass as accuracy and two F1 scores.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted label).

    Returns:
        Dict with ``'accuracy'``, ``'f1-score-1'`` (F1 of the class reported
        under key ``'1'``) and ``'f1-score-ma'`` (macro-averaged F1), all read
        from ``classification_report``.
    """
    y_true = pred.label_ids.ravel()
    y_pred = pred.predictions.argmax(-1).ravel()
    report = classification_report(y_true, y_pred, output_dict=True)

    metrics = {}
    metrics['accuracy'] = report['accuracy']
    metrics['f1-score-1'] = report['1']['f1-score']
    metrics['f1-score-ma'] = report['macro avg']['f1-score']
    return metrics

# Collator: pad every batch to its longest row and return PyTorch tensors.
col_fn = DataCollatorWithPadding(manager.tokenizer, padding='longest', return_tensors='pt')

training_args = TrainingArguments(
    output_dir='/disks/part4/trash',
    num_train_epochs=160,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # These two settings are important to ensure the dataset labels are
    # properly passed to the model.
    remove_unused_columns=False,
    label_names=['labels'],
)

# Train the soft-prompted backbone together with its classification head.
trainer = Trainer(
    model=manager.current_mix_model,
    args=training_args,
    data_collator=col_fn,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)
# trainer.label_names = ['labels']
trainer.train(ignore_keys_for_eval=["past_key_values"])

Epoch,Training Loss,Validation Loss,Accuracy,F1-score-1,F1-score-ma
1,No log,0.617917,0.691275,0.817253,0.411713
2,0.618200,0.620259,0.691275,0.817253,0.411713
3,0.618200,0.612236,0.691275,0.817253,0.411713
4,0.616500,0.613789,0.691275,0.817253,0.411713
5,0.616500,0.615989,0.691275,0.817253,0.411713
6,0.612800,0.614961,0.691275,0.817253,0.411713
7,0.612800,0.612622,0.691275,0.817253,0.411713
8,0.611300,0.613691,0.691275,0.817253,0.411713
9,0.611300,0.613889,0.691275,0.817253,0.411713
10,0.609400,0.616157,0.691275,0.817253,0.411713


KeyboardInterrupt: 

In [14]:
# Mean of the validation labels, i.e. the share of rows labelled 1.
# The value shown below equals the accuracy reported at every epoch of the
# training run above (0.691275), which is what a model that always predicts
# class 1 would score.
import numpy as np
np.mean(dataset['validation']['label'].numpy())

0.6912751677852349

# Debug

In [None]:
# Push the first 50 validation rows through backbone + head and back-propagate
# the loss, so that gradients can be inspected in the following cells.
inputs = col_fn(dataset['validation'][0:50]).to(DEVICE)
outputs = manager.current_mix_model(**inputs)
outputs.loss.backward()

In [None]:
# Print the summed absolute gradient of each soft prompt for blocks 6..11.
# NOTE(review): the setup cell's output lists wrapped blocks 2..11, so blocks
# 2..5 are not inspected here.
for i in range(6, 12):
    o = manager.current_mix_model.model.h[i].soft_prompt.sadcl_learned_embedding.grad.abs().sum().item()
    print(i, o)

In [None]:
manager.current_mix_model.sadcl_head.score.weight.grad

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts before the
# baseline training cell below — confirm.
raise Exception()

In [None]:
from transformers import GPT2ForSequenceClassification

# Baseline: the stock GPT-2 sequence classifier, trained on the same data
# with the same collator and metrics.
mtest = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    pad_token_id=manager.tokenizer.pad_token_id,
)
mtest.to(DEVICE)

training_args = TrainingArguments(
    output_dir='/home/mohalisad/Developer/Thesis/cp3',
    num_train_epochs=80,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # These two settings are important to ensure the dataset labels are
    # properly passed to the model.
    remove_unused_columns=False,
    label_names=['labels'],
)

trainer = Trainer(
    model=mtest,
    args=training_args,
    data_collator=col_fn,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)
# trainer.label_names = ['labels']
trainer.train()

# Trash

In [None]:
def map_inputs(str_list):
    """Tokenize raw strings the same way as the dataset and batch them on DEVICE.

    Args:
        str_list: List of input strings.

    Returns:
        The collated batch produced by ``DataCollatorWithPadding``, moved to
        ``DEVICE``.
    """
    tokens = manager.generate_tokenizer_map()(str_list)
    # Local collator; named differently so it does not shadow the notebook-level col_fn.
    collator = DataCollatorWithPadding(manager.tokenizer)
    return collator(tokens).to(DEVICE)

inputs = map_inputs(["Hello, my dog is cute", "bye", "why are"])
label = torch.tensor([0, 1, 1], device=DEVICE)
# MixHeadModel.forward pops the keyword `labels`; any other spelling would be
# forwarded to the backbone instead of reaching the head.
outputs = manager.current_mix_model(labels=label, **inputs)

In [None]:
token_ids = manager.tokenizer(INIT_TEXT, return_tensors='pt')['input_ids'].to(DEVICE)

In [None]:
token_ids

In [None]:
manager.model.wte(token_ids).shape

In [None]:
outputs.loss

In [None]:
manager.model.h[9].original_module.attn.c_attn.weight.grad

In [None]:
(torch.tensor([0, 1, 0]) == 0).any()


In [None]:
manager.model.h[9].soft_prompt.sadcl_learned_embedding.grad

In [None]:
outputs.last_hidden_state.sum().backward()

In [None]:


# NOTE(review): scratch cell. `tokenizer` and `model` are not defined in any
# visible cell of this notebook, so this fails on a fresh kernel — confirm
# whether it can be deleted.
last_hidden_states = outputs.last_hidden_state
inputs = tokenizer(["Hello, my dog is cute", "bye"])
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [None]:
inputs

In [None]:

# NOTE(review): `tokenize_dataset` is not defined in any visible cell of this
# notebook; presumably left over from an earlier version — confirm.
inputs = tokenize_dataset(["Hello, my dog is cute", "bye"])
inputs

In [None]:
tokenizer.eos_token

In [None]:
x = nn.Parameter(torch.arange(27).reshape(3, 3, 3).float())
x

In [None]:
b = nn.Parameter(torch.tensor([7, 7, 7]).float())
b

In [None]:
# NOTE(review): in-place write into a slice of a Parameter; this is expected to
# raise unless done under torch.no_grad() — confirm this is the behaviour being probed.
x[0, 0, :] = torch.tensor([7, 7, 7])