In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

class WandBWrapper:
    """Read-only helper for pulling validation metrics out of Weights & Biases.

    Project paths are built as ``prefix + name``; every query goes through a
    ``wandb.Api()`` client created when the wrapper is instantiated.
    """

    # Sort rank of each validation metric (lower rank comes first in a result
    # list). 'f1' and 'f1_a' intentionally share a rank.
    _METRIC_PRIORITY = {
        'matthews_correlation': 0,
        'f1': 1,
        'f1_a': 1,
        'accuracy': 2,
        'exact_match': 3,
        'pearson': 5,
        'spearmanr': 6,
    }

    def __init__(self, prefix=''):
        """Create the API client.

        Args:
            prefix: String prepended to every project name passed to `get_runs`.
        """
        # wandb is imported here, so it is only required once a wrapper is built.
        import wandb
        self.api = wandb.Api()
        self.prefix = prefix

    @staticmethod
    def _task_names(tasks, model_size):
        """Return ``f"{model_size}_{task}"`` for every task, as a list."""
        return [model_size + '_' + task_name for task_name in tasks]

    @staticmethod
    def _metric_name(column):
        """Drop everything up to and including the first underscore of `column`."""
        return column[column.find('_') + 1:]

    def get_runs(self, name):
        """Return the runs of the project ``self.prefix + name``."""
        return self.api.runs(f"{self.prefix}{name}")

    def _preprocess_config(self, run):
        """Return `run.config` without the keys that start with an underscore."""
        return {
            k: v for k, v in run.config.items()
            if not k.startswith('_')
        }

    def sort_valid_columns(self, cols):
        """Order validation-metric column names by metric priority.

        Args:
            cols: Column names of the form ``<split>_<metric>``.

        Returns:
            ``['valid_mean']`` if any column contains 'matched_accuracy' (the
            original "mnli dirty fix"); otherwise the columns that do not
            contain 'f1_m', sorted (stably) by the rank of their metric.

        Raises:
            KeyError: If a remaining column's metric has no known priority.
        """
        if any('matched_accuracy' in col for col in cols):  # mnli dirty fix
            return ['valid_mean']

        kept = [col for col in cols if 'f1_m' not in col]
        return sorted(
            kept,
            key=lambda col: self._METRIC_PRIORITY[self._metric_name(col)],
        )

    def _best_in_history(self, run, key):
        """Return the validation metrics logged at the row where `key` peaks.

        Args:
            run: Object whose ``history()`` returns a pandas DataFrame.
            key: Column used to pick the best row (NaN is treated as 0).

        Returns:
            List of metric values (negative values clamped to 0), ordered by
            `sort_valid_columns`.
        """
        history = run.history()
        metric_columns = [
            col for col in history.columns
            if 'valid' in col and 'mean' not in col
        ]
        # argmax() yields a *position*, so the row is read back with .iloc
        # below; label-based lookup is only equivalent on a default RangeIndex.
        best_pos = history[key].astype('float').fillna(0).argmax()
        metric_columns = self.sort_valid_columns(metric_columns)
        return [
            max(float(history[col].iloc[best_pos]), 0)
            for col in metric_columns
        ]

    def get_full_history(self, runs, tasks, model_size=''):
        """Return, per task, a DataFrame of the 'valid_mean' history of `runs`.

        Args:
            runs: List of run names; also fixes the column order of each frame.
            tasks: Iterable of task names (project suffixes).
            model_size: Prefix joined to every task name with an underscore.

        Returns:
            Dict mapping ``f"{model_size}_{task}"`` to a DataFrame with one
            column per requested run.
        """
        return {
            task_name: pd.DataFrame({
                run.name: run.history()['valid_mean']
                for run in self.get_runs(task_name)
                if run.name in runs
            })[runs]
            for task_name in self._task_names(tasks, model_size)
        }

    def get_runs_best(self, name, run_name_filter=None):
        """Map run name to its best validation metrics for project `name`.

        Args:
            name: Project suffix passed to `get_runs`.
            run_name_filter: Optional container of run names to keep; ``None``
                keeps every run.
        """
        return {
            run.name: self._best_in_history(run, 'valid_mean')
            for run in self.get_runs(name)
            if run_name_filter is None or run.name in run_name_filter
        }

    def get_runs_tasks_df(self, runs, tasks, model_size=''):
        """Return a DataFrame of best metrics with runs as rows and tasks as columns.

        Args:
            runs: List of run names to keep (and their row order).
            tasks: Iterable of task names (project suffixes).
            model_size: Prefix joined to every task name with an underscore.
        """
        results = {
            task_name: self.get_runs_best(task_name, runs)
            for task_name in self._task_names(tasks, model_size)
        }
        return pd.DataFrame(results).T[runs].T

In [2]:
# Hard-coded baseline scores (fractions in [0, 1]) keyed by short task name.
# A task with two metrics carries a two-element list in the order given by the
# trailing comment; '-' marks a value that is not available.
# `attempt` feeds the row labelled 'ATTEMPT $\star$' in the final table.
# NOTE(review): presumably copied from the corresponding paper — confirm source.
attempt = {
 'qqp': ['-', 0.903], # F1/acc
 'qnli': [0.930],
 'mnli': [0.843],
 'sst2': [0.932],
 'stsb': [0.897, '-'], # Pearson / rho
 'mrpc': ['-', 0.857], # F1/acc
 'cola': [0.574],
 'multirc': [0.744, "-"], # F1a / EM
 'rte': [0.734],
 'cb': ["-", 0.786], # F1/acc
 'copa': '-',
 'wic': [0.668],
 'boolq': [0.788],
}
# `residual` feeds the row labelled 'Residual PT $\star$'. A bare "-" (not
# wrapped in a list) means the whole task has no score.
# NOTE(review): presumably copied from the corresponding paper — confirm source.
residual = {
 'qqp': "-",
 'qnli': "-",
 'mnli': "-",
 'sst2': "-",
 'stsb': "-",
 'mrpc': "-",
 'cola': "-",
 'multirc': [0.593],
 'rte': [0.704],
 'cb': [0.792],
 'copa': [0.583],
 'wic': [0.668],
 'boolq': [0.779],
}

In [3]:
import json
import numpy as np
from pathlib import Path 

def load_gpt_score(base_path, task_name):
    """Read the stored metric values for one task from JSON files.

    Args:
        base_path: Directory holding ``<task_name>.json`` (for 'mnli':
            ``mnli_matched.json`` and ``mnli_mismatched.json``).
        task_name: Short task name.

    Returns:
        For 'mnli', a one-element list with the mean over every value found in
        both files. Otherwise the file's values as floats, with the 'f1_m'
        entry dropped and the rest ordered by metric priority.

    Raises:
        KeyError: If the file holds a metric without a known priority.
    """
    root = Path(base_path)

    def _read(stem):
        # Every score file is a flat JSON object stored as "<stem>.json".
        return json.loads((root / f'{stem}.json').read_text())

    if task_name == 'mnli':
        pooled = []
        for split in ('matched', 'mismatched'):
            pooled.extend(_read(f'{task_name}_{split}').values())
        return [np.mean(pooled)]

    performance = _read(task_name)

    rank = {
        'matthews_correlation': 0,
        'f1': 1,
        'f1_a': 1,
        'accuracy': 2,
        'exact_match': 3,
        'pearson': 5,
        'spearmanr': 6,
    }

    # sorted() is stable, so metrics sharing a rank keep their file order.
    ordered_metrics = sorted(
        (metric for metric in performance if metric != 'f1_m'),
        key=rank.__getitem__,
    )
    return [float(performance[metric]) for metric in ordered_metrics]

# Short task names; the resulting dict (and therefore the table columns built
# from it) keeps this order.
tasks = [
    'qqp',      # new datasets
    'qnli',     # new datasets
    'mnli',     # new datasets
    'sst2',     # new datasets
    'stsb',     # new datasets
    'mrpc',
    'cola',
    'multirc',  # new datasets
    'rte',
    'cb',
    'copa',
    'wic',
    'boolq',
]

# Task name -> list of metric values read from the relative 'openai' directory.
gpt_performances = dict(
    (task, load_gpt_score('openai', task))
    for task in tasks
)

In [4]:
# Maps each W&B project suffix to the short task name used as a column label.
# Iteration order defines the column order of every frame built below.
tasks = {
    # 'glue-wnli',
    # 'glue-rte',
    'glue-qqp': 'qqp',  # new datasets
    'glue-qnli': 'qnli',  # new datasets
    'glue-mnli': 'mnli',  # new datasets
    'glue-sst2': 'sst2',  # new datasets
    'glue-stsb': 'stsb',  # new datasets
    'glue-mrpc': 'mrpc',
    'glue-cola': 'cola',
    'superglue-multirc': 'multirc',  # new datasets
    'superglue-rte': 'rte',
    'superglue-cb': 'cb',
    'superglue-copa': 'copa',
    'superglue-wic': 'wic',
    'superglue-boolq': 'boolq',
}

# Run names to pull from every project.
runs = [
    '10_combine_128',
]

base_lmt5_df = WandBWrapper("mohalisad/hzi_cluster_t5_").get_runs_tasks_df(
    runs=runs, tasks=tasks.keys(), model_size='base'
)
# Manual override of the CB scores for this run (two values; the table header
# lists CB as F1/Acc.). The reason is not recorded in the notebook.
# .at writes the single cell directly; the previous chained form
# df[col][row] = value may assign into a temporary copy and is a no-op under
# pandas Copy-on-Write.
base_lmt5_df.at['10_combine_128', 'base_superglue-cb'] = [0.7826, 0.8214]

small_lmt5_df = WandBWrapper("mohalisad/hzi_cluster_t5_").get_runs_tasks_df(
    runs=runs, tasks=tasks.keys(), model_size='small'
)
small_lmt5_softmax_df = WandBWrapper("mohalisad/iclr_softmax_effect_t5_").get_runs_tasks_df(
    runs=runs, tasks=tasks.keys(), model_size='small'
)
# NOTE(review): unlike the prefixes above, this one carries no "mohalisad/"
# entity part — confirm that is intended.
base_origt5_df = WandBWrapper("iclr_orig_t5_t5_").get_runs_tasks_df(
    runs=runs, tasks=tasks.keys(), model_size='base'
)

In [5]:
# Relabel the W&B result frames with the short task names (same order as the
# keys that were used to build them).
for wandb_scores_df in (base_lmt5_df, small_lmt5_df, small_lmt5_softmax_df, base_origt5_df):
    wandb_scores_df.columns = tasks.values()

# Turn each task -> scores mapping into a one-row frame with tasks as columns.
attempt_df, residual_df, gpt_df = (
    pd.Series(task_scores).to_frame().T
    for task_scores in (attempt, residual, gpt_performances)
)

In [6]:
def my_concat(**kwargs):
    """Stack the given frames and label each resulting row by its keyword.

    Args:
        **kwargs: ``label=DataFrame`` pairs; the frames are concatenated in
            call order and the combined row count must equal the number of
            labels.

    Returns:
        A new DataFrame whose index is named 'name' and holds the keyword
        labels.
    """
    labels = list(kwargs.keys())
    stacked = pd.concat(list(kwargs.values()), ignore_index=True)
    return stacked.assign(name=labels).set_index('name')

# One frame per table section; the keyword of each argument becomes the row label.
comp_orig_df = my_concat(superpos=base_origt5_df, attempt=attempt_df, residual=residual_df)
comp_softmax_df = my_concat(superpos=small_lmt5_df, superpos_softmax=small_lmt5_softmax_df)
comb_base_df = my_concat(superpos=base_lmt5_df)
comp_gpt_df = my_concat(gpt=gpt_df)

In [14]:
import numpy as np
import itertools

def _tblr_args(rows_count_seq):
 top_rows = list(np.cumsum([4, *rows_count_seq]))
 top_rows_str = ', '.join(map(str, top_rows[:-1]))
 bold_line = ', '.join(map(str, top_rows))
 return r"""column{2-18} = {c},
 cell{1}{2, 3, 4} = {r=3}{b},
 cell{1}{5} = {c=7}{c},
 cell{1}{12} = {c=6}{},
 vline{2, 3, 4, 5,12,18} = {1-3}{},
 hline{2} = {4-17}{},
 row{%s} = {c},
 cell{%s}{1} = {c=18}{},
 hline{%s} = {-}{2px},,""" % (top_rows_str, top_rows_str, bold_line)

def _head_rows():
 return [
 r"&\rot{\eztb{\# Prompts}} & \rot{\eztb{Softmax}} & \rot{\eztb{Dropout}} & GLUE &&&&&&& SuperGLUE &&&&&&",
 r"Task→ &&&& QQP & QNLI & MNLI & SST-2 & STS-B & MRPC & CoLA & MultiRC & RTE & CB & COPA & WiC & BoolQ & Avg.",
 r"Method↓ &&&& F1/Acc. & Acc. & Acc. & Acc. & PCC/$\rho$ & F1/Acc. & MCC & F1a/EM & Acc. & F1/Acc. & Acc. & Acc. & Acc. & -"
 ]

def _section_row(name):
 """Return the table row for a section heading; currently `name` itself, unchanged."""
 return name

def to_pure_number(item):
    """Collapse one table cell to a single value.

    A list is averaged over its entries that are not '-'; a list with no such
    entries yields '-'. Anything that is not a list is returned unchanged.
    """
    if not isinstance(item, list):
        return item

    present = [value for value in item if value != '-']
    if not present:
        return '-'
    return sum(present) / len(present)

def to_pure_numbers(numbers):
    """Apply `to_pure_number` to every cell and return the results as a NumPy array."""
    collapsed = list(map(to_pure_number, numbers))
    return np.array(collapsed)

def _convert_single_number(single_number):
 if single_number == '-':
 return '-'
 if isinstance(single_number, str):
 print(single_number)
 return f"{100 * single_number:.1f}"

def _convert_number(n):
    """Render one table cell as text.

    A list is rendered as its formatted entries joined by '/'; any other value
    is treated as a one-element list. A cell whose collapsed value equals 0
    gets a trailing dagger marker.
    """
    values = n if isinstance(n, list) else [n]
    rendered = "/".join(map(_convert_single_number, values))
    if to_pure_number(values) == 0:
        return rendered + ' $\\dag$'
    return rendered

def _get_mark(mark_bool):
 if mark_bool is None:
 return ""
 return "\\cmark" if mark_bool else "\\xmark"

def _normal_row(name, prompt_count, is_softmax, is_dropout, numbers, bold_mask=None):
    """Assemble one data row of the table as ' & '-joined cells.

    Args:
        name: Text of the first cell.
        prompt_count: Value for the second cell; None leaves it empty.
        is_softmax: Flag for the third cell (see `_get_mark`).
        is_dropout: Flag for the fourth cell (see `_get_mark`).
        numbers: Score cells, each rendered with `_convert_number`.
        bold_mask: Optional per-score flags; a truthy flag wraps the score at
            the same position in ``\\textbf{...}``.
    """
    score_cells = list(map(_convert_number, numbers))

    if bold_mask is not None:
        bold_positions = [pos for pos, flag in enumerate(bold_mask) if flag]
        for pos in bold_positions:
            score_cells[pos] = "\\textbf{" + score_cells[pos] + "}"

    prompt_cell = "" if prompt_count is None else str(prompt_count)
    leading_cells = [name, prompt_cell, _get_mark(is_softmax), _get_mark(is_dropout)]
    return " & ".join(leading_cells + score_cells)

def _compute_mean(numbers):
    """Return a one-column object array holding each row's average.

    A row that contains the bare placeholder '-' as one of its cells gets '-'
    instead of a number; otherwise the cells are collapsed with
    `to_pure_numbers` and averaged.
    """
    def _row_average(row):
        if '-' in list(row):
            return '-'
        return to_pure_numbers(row).mean()

    averages = [_row_average(row) for row in numbers]
    # Built as a single row and transposed, giving one entry per input row.
    return np.array([averages], dtype=object).T

def generate_rows(names, prompt_counts, softmaxes, dropouts, numbers, first_row_bold=False):
    """Render the data rows of one table section.

    Args:
        names: Row labels.
        prompt_counts: Per-row prompt counts (None leaves the cell empty).
        softmaxes: Per-row softmax flags.
        dropouts: Per-row dropout flags.
        numbers: 2-D array of score cells, one row per method.
        first_row_bold: When true, every score of the first row is set in bold.

    Returns:
        List of row strings, each with an extra trailing average cell.
    """
    with_average = np.concatenate((numbers, _compute_mean(numbers)), axis=1)
    per_row_args = [names, prompt_counts, softmaxes, dropouts, with_average]

    if first_row_bold:
        bold = np.zeros_like(with_average)
        bold[0, :] = 1
        per_row_args.append(bold.astype(bool))

    return [_normal_row(*row_args) for row_args in zip(*per_row_args)]
 
def generate_table(input_dict):
    """Return the complete ``tblr`` environment as a string.

    Args:
        input_dict: Maps each section title to that section's rendered rows,
            in display order.

    Returns:
        The environment text: option block, header rows, then every section
        (title row followed by its rows), each row ended by ``\\\\`` and a
        newline.
    """
    sections = [
        (_section_row(title), *section_rows)
        for title, section_rows in input_dict.items()
    ]
    section_sizes = list(map(len, sections))
    lines = _head_rows() + list(itertools.chain.from_iterable(sections))

    row_terminator = '\\\\\n'
    body = "".join(line + row_terminator for line in lines)

    template = r"""\begin{tblr}{
 %s
}
%s
\end{tblr}
"""
    return template % (_tblr_args(section_sizes), body)

In [15]:
# Render the rows of each table section. Labels containing LaTeX commands are
# raw strings so that "\star" is not parsed as an (invalid) escape sequence.
comp_orig_rows = generate_rows(
    names=['SuperPos PT', r'ATTEMPT $\star$', r'Residual PT $\star$'],
    prompt_counts=[10, 100, 10],
    softmaxes=[False, True, False],
    dropouts=[False, True, True],
    numbers=comp_orig_df.to_numpy(),
    first_row_bold=True
)
comp_softmax_rows = generate_rows(
    names=['SuperPos PT', 'SuperPos PT'],
    prompt_counts=[10, 10],
    softmaxes=[False, True],
    dropouts=[False, False],
    numbers=comp_softmax_df.to_numpy(),
    first_row_bold=True
)
comb_base_rows = generate_rows(
    names=['SuperPos PT'],
    prompt_counts=[10],
    softmaxes=[False],
    dropouts=[False],
    numbers=comb_base_df.to_numpy()
)
comp_gpt_rows = generate_rows(
    names=['1 Shot'],
    prompt_counts=[None],
    softmaxes=[None],
    dropouts=[None],
    numbers=comp_gpt_df.to_numpy()
)


# Emit the LaTeX source; dict order is the section order in the table.
print(generate_table({
    'T5 Base': comp_orig_rows,
    'T5v1.1 Small LM-Adapted': comp_softmax_rows,
    'T5v1.1 Base LM-Adapted': comb_base_rows,
    'GPT-3.5-Turbo': comp_gpt_rows
}))

\begin{tblr}{
 column{2-18} = {c},
 cell{1}{2, 3, 4} = {r=3}{b},
 cell{1}{5} = {c=7}{c},
 cell{1}{12} = {c=6}{},
 vline{2, 3, 4, 5,12,18} = {1-3}{},
 hline{2} = {4-17}{},
 row{4, 8, 11, 13} = {c},
 cell{4, 8, 11, 13}{1} = {c=18}{},
 hline{4, 8, 11, 13, 15} = {-}{2px},,
}
&\rot{\eztb{\# Prompts}} & \rot{\eztb{Softmax}} & \rot{\eztb{Dropout}} & GLUE &&&&&&& SuperGLUE &&&&&&\\
Task→ &&&& QQP & QNLI & MNLI & SST-2 & STS-B & MRPC & CoLA & MultiRC & RTE & CB & COPA & WiC & BoolQ & Avg.\\
Method↓ &&&& F1/Acc. & Acc. & Acc. & Acc. & PCC/$\rho$ & F1/Acc. & MCC & F1a/EM & Acc. & F1/Acc. & Acc. & Acc. & Acc. & -\\
T5 Base\\
SuperPos PT & 10 & \xmark & \xmark & \textbf{87.8/90.8} & \textbf{93.5} & \textbf{86.0} & \textbf{94.4} & \textbf{90.2/90.1} & \textbf{92.4/89.5} & \textbf{59.7} & \textbf{77.7/40.9} & \textbf{80.1} & \textbf{97.4/96.4} & \textbf{66.0} & \textbf{67.6} & \textbf{81.3} & \textbf{81.2}\\
ATTEMPT $\star$ & 100 & \cmark & \cmark & -/90.3 & 93.0 & 84.3 & 93.2 & 89.7/- & -/85.7 & 57.

In [9]:
# NOTE(review): `base_df` is not defined anywhere in the visible notebook, so
# this cell raises NameError (see its recorded output). Presumably a stale
# scratch cell or a renamed frame (perhaps `comb_base_df`) — confirm or delete.
base_df.to_numpy()

NameError: name 'base_df' is not defined

In [None]:
import pandas as pd

In [None]:
# Scratch check: takes the first row of a frame whose column 'a' mixes numbers
# with the '-' placeholder and averages that row (the row itself holds only
# numbers). NOTE(review): looks like an experiment for `_compute_mean` — confirm
# or delete.
pd.DataFrame({'a': [1, 2., '-'], 'b': [0, 5, 1]}).to_numpy()[0].mean()