{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" }, "tags": [] }, "source": [ "# Intro" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "from abc import abstractmethod, ABC\n", "from os import PathLike\n", "from typing import Dict, Union, Optional, Iterable\n", "\n", "\n", "class base_peft(ABC):\n", " def __init__(self, base_model_name: Union[str, PathLike[str]], mask_token_id: int):\n", " self.base_model_name = base_model_name\n", " self.mask_token_id = mask_token_id\n", "\n", " def activate_task_for_training\n", "\n", " @abstractmethod\n", " def finetune_task(self, peft_name: str, train_dataset, validation_dataset):\n", " pass" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2023-08-15T13:16:40.910406Z", "start_time": "2023-08-15T13:16:40.860981Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/mohalisad/Developer/ProgressivePrompts\n" ] } ], "source": [ "cd /home/mohalisad/Developer/ProgressivePrompts" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2023-08-15T13:16:42.467311Z", "start_time": "2023-08-15T13:16:42.313951Z" }, "pycharm": { "is_executing": true, "name": "#%%\n" }, "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python version is: 3.9.17\n", "Torch version is: 1.13.1+cu117\n", "Nvidia device is: NVIDIA GeForce RTX 4090\n", "Transformers version is: 4.26.1\n", "Adapterhub version is: 3.2.1\n" ] } ], "source": [ "from utils import print_system_info\n", "print_system_info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "tags": [] }, "outputs": [], "source": [ "from _datasets import AutoLoad\n", "from config import load_config\n", "from _models import BertAdapterModelWrapper, TokenizerMan\n", "\n", "\n", "config = load_config('config.yaml')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading configuration file config.json from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/config.json\n", "Model config BertConfig {\n", " \"architectures\": [\n", " \"BertForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " \"vocab_size\": 30522\n", "}\n", "\n", "loading weights file model.safetensors from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/model.safetensors\n", "Generate config GenerationConfig {\n", " \"pad_token_id\": 0,\n", " \"transformers_version\": \"4.26.1\"\n", "}\n", "\n", "Some weights of the model checkpoint at bert-base-uncased were not used when 
initializing BertAdapterModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "All the weights of BertAdapterModel were initialized from the model checkpoint at bert-base-uncased.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use BertAdapterModel for predictions without further training.\n", "Generation config file not found, using a generation config created from the model config.\n", "loading file vocab.txt from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/vocab.txt\n", "loading file tokenizer.json from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/tokenizer.json\n", "loading file added_tokens.json from cache at None\n", "loading file special_tokens_map.json from cache at None\n", "loading file tokenizer_config.json from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/tokenizer_config.json\n", "loading configuration file config.json from cache at /home/mohalisad/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/config.json\n", "Model config BertConfig {\n", " \"_name_or_path\": \"bert-base-uncased\",\n", " \"architectures\": [\n", " \"BertForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " \"vocab_size\": 30522\n", "}\n", "\n" ] } ], "source": [ "# import transformers\n", "# transformers.logging.set_verbosity_debug()\n", "adapter_wrapper = BertAdapterModelWrapper(\n", " base_model_name=config.base_model.name,\n", " mask_token_id=config.base_model.mask_token_id\n", ")\n", "tokenizer_man = TokenizerMan(config.base_model.kind, config.base_model.name)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "tags": [] }, "outputs": [], "source": [ "auto_loader = AutoLoad()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f983a58646a54aa6841312408f00f491", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/8551 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "99ea0309b4384a0ab7a458710ae2e443", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1043 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d041fd8948044b5e8b0f761079a04894", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1063 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Adding adapter 'glue:cola'.\n", "Adding head 'glue:cola' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.\n", "PyTorch: setting up devices\n", "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", "/home/mohalisad/anaconda3/envs/lll/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "***** Running training *****\n", " Num examples = 8551\n", " Num Epochs = 15\n", " Instantaneous batch size per device = 32\n", " Total train batch size (w. parallel, distributed & accumulation) = 32\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 4020\n", " Number of trainable parameters = 1486658\n", "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
Epoch | Training Loss | Validation Loss | Accuracy | F1-score-1 | F1-score-ma\n",
"---|---|---|---|---|---\n",
"1 | No log | 0.521243 | 0.772771 | 0.854512 | 0.667956\n",
"2 | 0.484900 | 0.475989 | 0.795781 | 0.866290 | 0.717121\n",
"3 | 0.484900 | 0.473902 | 0.799616 | 0.868471 | 0.723974\n",
"4 | 0.390000 | 0.454408 | 0.815916 | 0.877707 | 0.752807\n",
"5 | 0.390000 | 0.460564 | 0.822627 | 0.880414 | 0.768593\n",
"6 | 0.330900 | 0.421414 | 0.831256 | 0.883752 | 0.788030\n",
"7 | 0.330900 | 0.452820 | 0.833174 | 0.885375 | 0.789519\n",
"8 | 0.292000 | 0.465746 | 0.826462 | 0.881777 | 0.777825\n",
"9 | 0.292000 | 0.491992 | 0.832215 | 0.885396 | 0.786169\n",
"10 | 0.255500 | 0.508437 | 0.827421 | 0.883117 | 0.776723\n",
"11 | 0.255500 | 0.519635 | 0.837009 | 0.888889 | 0.791567\n",
"12 | 0.232300 | 0.522434 | 0.828380 | 0.883388 | 0.779262\n",
"13 | 0.232300 | 0.532363 | 0.835091 | 0.886991 | 0.791013\n",
"14 | 0.219900 | 0.557935 | 0.831256 | 0.885566 | 0.782199\n",
"15 | 0.202800 | 0.547973 | 0.832215 | 0.885845 | 0.784695\n"
],
"text/plain": [
"root\n",
"├── bert (BertModel)\n",
"│ ├── embeddings (BertEmbeddings)\n",
"│ │ ├── word_embeddings (Embedding) weight:[30522, 768]\n",
"│ │ ├── position_embeddings (Embedding) weight:[512, 768]\n",
"│ │ ├── token_type_embeddings (Embedding) weight:[2, 768]\n",
"│ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ ├── encoder (BertEncoder)\n",
"│ │ └── layer (ModuleList)\n",
"│ │ └── 0-11(BertLayer)\n",
"│ │ ├── attention (BertAttention)\n",
"│ │ │ ├── self (BertSelfAttention)\n",
"│ │ │ │ ├── query,key,value(Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ │ └── prefix_tuning (PrefixTuningShim)\n",
"│ │ │ │ └── pool (PrefixTuningPool)\n",
"│ │ │ └── output (BertSelfOutput)\n",
"│ │ │ ├── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ │ ├── intermediate (BertIntermediate)\n",
"│ │ │ └── dense (Linear) weight:[3072, 768] bias:[3072]\n",
"│ │ └── output (BertOutput)\n",
"│ │ ├── dense (Linear) weight:[768, 3072] bias:[768]\n",
"│ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ ├── pooler (BertPooler)\n",
"│ │ └── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ └── prefix_tuning (PrefixTuningPool)\n",
"└── classifier (Linear) weight:[2, 768] bias:[2]\n",
"
\n"
],
"text/plain": [
"\u001b[37mroot\u001b[0m\n",
"├── \u001b[37mbert \u001b[0m\u001b[32m(BertModel)\u001b[0m\n",
"│ ├── \u001b[37membeddings \u001b[0m\u001b[32m(BertEmbeddings)\u001b[0m\n",
"│ │ ├── \u001b[37mword_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[36mweight:[30522, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mposition_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[36mweight:[512, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mtoken_type_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[36mweight:[2, 768]\u001b[0m\n",
"│ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[36mweight:[768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mencoder \u001b[0m\u001b[32m(BertEncoder)\u001b[0m\n",
"│ │ └── \u001b[37mlayer \u001b[0m\u001b[32m(ModuleList)\u001b[0m\n",
"│ │ └── \u001b[31m0-11\u001b[0m\u001b[32m(BertLayer)\u001b[0m\n",
"│ │ ├── \u001b[37mattention \u001b[0m\u001b[32m(BertAttention)\u001b[0m\n",
"│ │ │ ├── \u001b[37mself \u001b[0m\u001b[32m(BertSelfAttention)\u001b[0m\n",
"│ │ │ │ ├── \u001b[31mquery,key,value\u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ │ │ │ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningShim)\u001b[0m\n",
"│ │ │ │ └── \u001b[37mpool \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"│ │ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertSelfOutput)\u001b[0m\n",
"│ │ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ │ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[36mweight:[768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ │ ├── \u001b[37mintermediate \u001b[0m\u001b[32m(BertIntermediate)\u001b[0m\n",
"│ │ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[3072, 768] \u001b[0m\u001b[36mbias:[3072]\u001b[0m\n",
"│ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertOutput)\u001b[0m\n",
"│ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 3072] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[36mweight:[768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mpooler \u001b[0m\u001b[32m(BertPooler)\u001b[0m\n",
"│ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"└── \u001b[37mclassifier \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[2, 768] \u001b[0m\u001b[36mbias:[2]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Visualization(base_model).structure_graph();"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"delta_model = AdapterModel(base_model, bottleneck_dim=48)\n",
"# leave the delta tuning modules and the newly initialized classification head tunable.\n",
"delta_model.freeze_module(exclude=[\"deltas\", \"classifier\"])"
]
},
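{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check (a minimal sketch added here, not part of the original run): after `freeze_module`, only the delta modules and the classifier head should still require gradients. Counting parameters by `requires_grad` makes the effect of the freeze explicit."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: compare trainable vs. total parameters to confirm the freeze took effect.\n",
"# Assumes `base_model` is the backbone that `delta_model` was attached to above.\n",
"trainable = sum(p.numel() for p in base_model.parameters() if p.requires_grad)\n",
"total = sum(p.numel() for p in base_model.parameters())\n",
"print(f'trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)')"
]
},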
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"root\n",
"├── bert (BertModel)\n",
"│ ├── embeddings (BertEmbeddings)\n",
"│ │ ├── word_embeddings (Embedding) weight:[30522, 768]\n",
"│ │ ├── position_embeddings (Embedding) weight:[512, 768]\n",
"│ │ ├── token_type_embeddings (Embedding) weight:[2, 768]\n",
"│ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ ├── encoder (BertEncoder)\n",
"│ │ └── layer (ModuleList)\n",
"│ │ └── 0-11(BertLayer)\n",
"│ │ ├── attention (BertAttention)\n",
"│ │ │ ├── self (BertSelfAttention)\n",
"│ │ │ │ ├── query,key,value(Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ │ └── prefix_tuning (PrefixTuningShim)\n",
"│ │ │ │ └── pool (PrefixTuningPool)\n",
"│ │ │ └── output (BertSelfOutput)\n",
"│ │ │ ├── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ │ └── adapter (AdapterLayer)\n",
"│ │ │ │ └── modulelist (Sequential)\n",
"│ │ │ │ ├── down_proj (Linear) weight:[48, 768] bias:[48]\n",
"│ │ │ │ └── up_proj (Linear) weight:[768, 48] bias:[768]\n",
"│ │ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ │ ├── intermediate (BertIntermediate)\n",
"│ │ │ └── dense (Linear) weight:[3072, 768] bias:[3072]\n",
"│ │ └── output (BertOutput)\n",
"│ │ ├── dense (Linear) weight:[768, 3072] bias:[768]\n",
"│ │ │ └── adapter (AdapterLayer)\n",
"│ │ │ └── modulelist (Sequential)\n",
"│ │ │ ├── down_proj (Linear) weight:[48, 768] bias:[48]\n",
"│ │ │ └── up_proj (Linear) weight:[768, 48] bias:[768]\n",
"│ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ ├── pooler (BertPooler)\n",
"│ │ └── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ └── prefix_tuning (PrefixTuningPool)\n",
"└── classifier (Linear) weight:[2, 768] bias:[2]\n",
"
\n"
],
"text/plain": [
"\u001b[37mroot\u001b[0m\n",
"├── \u001b[37mbert \u001b[0m\u001b[32m(BertModel)\u001b[0m\n",
"│ ├── \u001b[37membeddings \u001b[0m\u001b[32m(BertEmbeddings)\u001b[0m\n",
"│ │ ├── \u001b[37mword_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[30522, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mposition_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[512, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mtoken_type_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[2, 768]\u001b[0m\n",
"│ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mencoder \u001b[0m\u001b[32m(BertEncoder)\u001b[0m\n",
"│ │ └── \u001b[37mlayer \u001b[0m\u001b[32m(ModuleList)\u001b[0m\n",
"│ │ └── \u001b[31m0-11\u001b[0m\u001b[32m(BertLayer)\u001b[0m\n",
"│ │ ├── \u001b[37mattention \u001b[0m\u001b[32m(BertAttention)\u001b[0m\n",
"│ │ │ ├── \u001b[37mself \u001b[0m\u001b[32m(BertSelfAttention)\u001b[0m\n",
"│ │ │ │ ├── \u001b[31mquery,key,value\u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ │ │ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningShim)\u001b[0m\n",
"│ │ │ │ └── \u001b[37mpool \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"│ │ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertSelfOutput)\u001b[0m\n",
"│ │ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ │ │ └── \u001b[37madapter \u001b[0m\u001b[32m(AdapterLayer)\u001b[0m\n",
"│ │ │ │ └── \u001b[37mmodulelist \u001b[0m\u001b[32m(Sequential)\u001b[0m\n",
"│ │ │ │ ├── \u001b[37mdown_proj \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;175;0;255mweight:[48, 768] \u001b[0m\u001b[38;2;175;0;255mbias:[48]\u001b[0m\n",
"│ │ │ │ └── \u001b[37mup_proj \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;175;0;255mweight:[768, 48] \u001b[0m\u001b[38;2;175;0;255mbias:[768]\u001b[0m\n",
"│ │ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ ├── \u001b[37mintermediate \u001b[0m\u001b[32m(BertIntermediate)\u001b[0m\n",
"│ │ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[3072, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[3072]\u001b[0m\n",
"│ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertOutput)\u001b[0m\n",
"│ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 3072] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ │ └── \u001b[37madapter \u001b[0m\u001b[32m(AdapterLayer)\u001b[0m\n",
"│ │ │ └── \u001b[37mmodulelist \u001b[0m\u001b[32m(Sequential)\u001b[0m\n",
"│ │ │ ├── \u001b[37mdown_proj \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;175;0;255mweight:[48, 768] \u001b[0m\u001b[38;2;175;0;255mbias:[48]\u001b[0m\n",
"│ │ │ └── \u001b[37mup_proj \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;175;0;255mweight:[768, 48] \u001b[0m\u001b[38;2;175;0;255mbias:[768]\u001b[0m\n",
"│ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mpooler \u001b[0m\u001b[32m(BertPooler)\u001b[0m\n",
"│ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"└── \u001b[37mclassifier \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[2, 768] \u001b[0m\u001b[36mbias:[2]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Visualization(base_model).structure_graph();"
]
},
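{
"cell_type": "markdown",
"metadata": {},
"source": [
"opendelta can also report the modification itself. A minimal sketch, assuming the `delta_model` from above is still in scope:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: print opendelta's own summary of the inserted modules and\n",
"# the trainable-parameter ratio after freezing.\n",
"delta_model.log()"
]
},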
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T16:06:44.674950Z",
"start_time": "2023-08-13T16:06:42.233454Z"
}
},
"outputs": [],
"source": [
"from transformers import TrainingArguments, Trainer\n",
"from sklearn.metrics import classification_report\n",
"\n",
"\n",
"def compute_metrics(pred):\n",
" true_labels = pred.label_ids.ravel()\n",
" pred_labels = pred.predictions.argmax(-1).ravel()\n",
" report = classification_report(true_labels, pred_labels, output_dict=True)\n",
" return {\n",
" 'accuracy': report['accuracy'],\n",
" 'f1-score-1': report['1']['f1-score'],\n",
" 'f1-score-ma': report['macro avg']['f1-score']\n",
" }\n",
"\n",
"\n",
"def train_model(input_model, task_name, train_dataset, eval_dataset, col_fn):\n",
" training_args = TrainingArguments(\n",
" evaluation_strategy=\"epoch\",\n",
" save_strategy=\"epoch\",\n",
" # The next 2 lines are important to ensure the dataset labels are properly passed to the model\n",
" remove_unused_columns=False,\n",
" **config.hf_trainer_params.to_dict()\n",
" )\n",
"\n",
" trainer = Trainer(\n",
" model=input_model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=eval_dataset,\n",
" data_collator=col_fn,\n",
" compute_metrics=compute_metrics\n",
" )\n",
" trainer.train()\n",
"\n",
"\n",
"for task_name in config.tasks:\n",
" loader_out = auto_loader.get_and_map(tokenizer_man.tokenizer, task_name)\n",
" num_labels = len(loader_out['output']['range'])\n",
" train_model(\n",
" base_model,\n",
" task_name,\n",
" loader_out['train'],\n",
" loader_out['valid'],\n",
" tokenizer_man.get_col_fn()\n",
" )"
]
},
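{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the metric computation concrete, `compute_metrics` can be exercised on a mocked `EvalPrediction`-style object (a hypothetical toy input, not from the training run): `label_ids` holds the gold labels and `predictions` the raw logits whose argmax gives the predicted class."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from types import SimpleNamespace\n",
"\n",
"# Toy logits for 4 examples and 2 classes; argmax yields [1, 0, 1, 1].\n",
"fake_pred = SimpleNamespace(\n",
"    label_ids=np.array([1, 0, 0, 1]),\n",
"    predictions=np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.2, 0.8]]),\n",
")\n",
"compute_metrics(fake_pred)  # accuracy 0.75, f1-score-1 0.8"
]
},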
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"root\n",
"├── bert (BertModel)\n",
"│ ├── embeddings (BertEmbeddings)\n",
"│ │ ├── word_embeddings (Embedding) weight:[30522, 768]\n",
"│ │ ├── position_embeddings (Embedding) weight:[512, 768]\n",
"│ │ ├── token_type_embeddings (Embedding) weight:[2, 768]\n",
"│ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ ├── encoder (BertEncoder)\n",
"│ │ └── layer (ModuleList)\n",
"│ │ └── 0-11(BertLayer)\n",
"│ │ ├── attention (BertAttention)\n",
"│ │ │ ├── self (BertSelfAttention)\n",
"│ │ │ │ ├── query,key,value(Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ │ └── prefix_tuning (PrefixTuningShim)\n",
"│ │ │ │ └── pool (PrefixTuningPool)\n",
"│ │ │ └── output (BertSelfOutput)\n",
"│ │ │ ├── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ │ │ └── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ │ ├── intermediate (BertIntermediate)\n",
"│ │ │ └── dense (Linear) weight:[3072, 768] bias:[3072]\n",
"│ │ └── output (BertOutput)\n",
"│ │ ├── dense (Linear) weight:[768, 3072] bias:[768]\n",
"│ │ ├── LayerNorm (LayerNorm) weight:[768] bias:[768]\n",
"│ │ └── adapters (ModuleDict)\n",
"│ │ └── glue:cola (Adapter)\n",
"│ │ ├── non_linearity (Activation_Function_Class)\n",
"│ │ ├── adapter_down (Sequential)\n",
"│ │ │ ├── 0 (Linear) weight:[48, 768] bias:[48]\n",
"│ │ │ └── 1 (Activation_Function_Class)\n",
"│ │ └── adapter_up (Linear) weight:[768, 48] bias:[768]\n",
"│ ├── pooler (BertPooler)\n",
"│ │ └── dense (Linear) weight:[768, 768] bias:[768]\n",
"│ └── prefix_tuning (PrefixTuningPool)\n",
"└── heads (ModuleDict)\n",
" └── glue:cola (ClassificationHead)\n",
" ├── 1 (Linear) weight:[768, 768] bias:[768]\n",
" ├── 2 (Activation_Function_Class)\n",
" └── 4 (Linear) weight:[2, 768] bias:[2]\n",
"
\n"
],
"text/plain": [
"\u001b[37mroot\u001b[0m\n",
"├── \u001b[37mbert \u001b[0m\u001b[32m(BertModel)\u001b[0m\n",
"│ ├── \u001b[37membeddings \u001b[0m\u001b[32m(BertEmbeddings)\u001b[0m\n",
"│ │ ├── \u001b[37mword_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[30522, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mposition_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[512, 768]\u001b[0m\n",
"│ │ ├── \u001b[37mtoken_type_embeddings \u001b[0m\u001b[32m(Embedding) \u001b[0m\u001b[38;2;0;70;100mweight:[2, 768]\u001b[0m\n",
"│ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mencoder \u001b[0m\u001b[32m(BertEncoder)\u001b[0m\n",
"│ │ └── \u001b[37mlayer \u001b[0m\u001b[32m(ModuleList)\u001b[0m\n",
"│ │ └── \u001b[31m0-11\u001b[0m\u001b[32m(BertLayer)\u001b[0m\n",
"│ │ ├── \u001b[37mattention \u001b[0m\u001b[32m(BertAttention)\u001b[0m\n",
"│ │ │ ├── \u001b[37mself \u001b[0m\u001b[32m(BertSelfAttention)\u001b[0m\n",
"│ │ │ │ ├── \u001b[31mquery,key,value\u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ │ │ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningShim)\u001b[0m\n",
"│ │ │ │ └── \u001b[37mpool \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"│ │ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertSelfOutput)\u001b[0m\n",
"│ │ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ │ └── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ ├── \u001b[37mintermediate \u001b[0m\u001b[32m(BertIntermediate)\u001b[0m\n",
"│ │ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[3072, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[3072]\u001b[0m\n",
"│ │ └── \u001b[37moutput \u001b[0m\u001b[32m(BertOutput)\u001b[0m\n",
"│ │ ├── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 3072] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ ├── \u001b[37mLayerNorm \u001b[0m\u001b[32m(LayerNorm) \u001b[0m\u001b[38;2;0;70;100mweight:[768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ │ └── \u001b[37madapters \u001b[0m\u001b[32m(ModuleDict)\u001b[0m\n",
"│ │ └── \u001b[37mglue:cola \u001b[0m\u001b[32m(Adapter)\u001b[0m\n",
"│ │ ├── \u001b[37mnon_linearity \u001b[0m\u001b[32m(Activation_Function_Class)\u001b[0m\n",
"│ │ ├── \u001b[37madapter_down \u001b[0m\u001b[32m(Sequential)\u001b[0m\n",
"│ │ │ ├── \u001b[37m0 \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[48, 768] \u001b[0m\u001b[36mbias:[48]\u001b[0m\n",
"│ │ │ └── \u001b[37m1 \u001b[0m\u001b[32m(Activation_Function_Class)\u001b[0m\n",
"│ │ └── \u001b[37madapter_up \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 48] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
"│ ├── \u001b[37mpooler \u001b[0m\u001b[32m(BertPooler)\u001b[0m\n",
"│ │ └── \u001b[37mdense \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[38;2;0;70;100mweight:[768, 768] \u001b[0m\u001b[38;2;0;70;100mbias:[768]\u001b[0m\n",
"│ └── \u001b[37mprefix_tuning \u001b[0m\u001b[32m(PrefixTuningPool)\u001b[0m\n",
"└── \u001b[37mheads \u001b[0m\u001b[32m(ModuleDict)\u001b[0m\n",
" └── \u001b[37mglue:cola \u001b[0m\u001b[32m(ClassificationHead)\u001b[0m\n",
" ├── \u001b[37m1 \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[768, 768] \u001b[0m\u001b[36mbias:[768]\u001b[0m\n",
" ├── \u001b[37m2 \u001b[0m\u001b[32m(Activation_Function_Class)\u001b[0m\n",
" └── \u001b[37m4 \u001b[0m\u001b[32m(Linear) \u001b[0m\u001b[36mweight:[2, 768] \u001b[0m\u001b[36mbias:[2]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Visualization(adapter_wrapper.model).structure_graph();"
]
},
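{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only the adapter and its head carry task-specific weights, so persisting them is cheap. A minimal sketch, assuming `adapter_wrapper.model` exposes the standard adapter-transformers API and the adapter is named `glue:cola` as in the structure graph above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: save the trained adapter together with its classification head;\n",
"# it can later be restored into a fresh model via `load_adapter`.\n",
"adapter_wrapper.model.save_adapter('./adapters/glue_cola', 'glue:cola', with_head=True)"
]
},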
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-15T13:11:54.968862Z",
"start_time": "2023-08-15T13:11:54.946870Z"
}
},
"outputs": [],
"source": [
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-15T13:23:50.492273Z",
"start_time": "2023-08-15T13:22:40.985364Z"
}
},
"outputs": [],
"source": [
"from _datasets import GLUEHelper\n",
" \n",
"gl_helper = GLUEHelper()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-15T13:46:17.380290Z",
"start_time": "2023-08-15T13:46:17.346993Z"
}
},
"outputs": [],
"source": [
"for n in range(0, 1000):\n",
" out = gl_helper.datasets['stsb']['train'][n]\n",
" if out['label'] == 0.:\n",
" print(out)\n",
" break"
]
},
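{
"cell_type": "markdown",
"metadata": {},
"source": [
"The linear scan above stops at the first zero-score pair. An equivalent route (a sketch, assuming `gl_helper.datasets['stsb']` is a regular `datasets.DatasetDict`) is to `filter` the split and inspect the first match:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: collect every train example whose similarity label is exactly 0.\n",
"zero_label = gl_helper.datasets['stsb']['train'].filter(lambda ex: ex['label'] == 0.0)\n",
"print(len(zero_label))\n",
"print(zero_label[0])"
]
},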
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from evaluate import load\n",
"glue_metric = load('glue', 'stsb')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = glue_metric.compute(predictions=[-0.5, -0.3], references=[-0.5, 1])\n",
"results"
]
},
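{
"cell_type": "markdown",
"metadata": {},
"source": [
"For STS-B the GLUE metric reports the Pearson and Spearman correlation between predictions and references. The same numbers can be cross-checked directly with `scipy` (a sketch using the same toy values as above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import pearsonr, spearmanr\n",
"\n",
"preds, refs = [-0.5, -0.3], [-0.5, 1]\n",
"# With only two points both correlations are degenerate (|r| = 1),\n",
"# so toy inputs like these only verify the wiring, not model quality.\n",
"print(pearsonr(preds, refs)[0], spearmanr(preds, refs)[0])"
]
},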
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T18:17:59.084998Z",
"start_time": "2023-08-13T18:17:59.050653Z"
}
},
"outputs": [],
"source": [
"gl_helper.datasets['mnli']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T18:17:59.157406Z",
"start_time": "2023-08-13T18:17:59.081370Z"
}
},
"outputs": [],
"source": [
"gl_helper.datasets['mnli_matched']\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T18:18:01.203910Z",
"start_time": "2023-08-13T18:18:01.171842Z"
}
},
"outputs": [],
"source": [
"gl_helper.datasets['mnli_mismatched']\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T18:30:16.905587Z",
"start_time": "2023-08-13T18:30:16.775197Z"
}
},
"outputs": [],
"source": [
"import transformers\n",
"\n",
"\n",
"print(transformers.__version__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2023-08-13T18:29:49.383120Z",
"start_time": "2023-08-13T18:29:40.017083Z"
}
},
"outputs": [],
"source": [
"pip install adapter-transformers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:lll]",
"language": "python",
"name": "conda-env-lll-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
}
},
"nbformat": 4,
"nbformat_minor": 4
}