
prepare for hyperparameter tuning

define_task
mohamad maheri committed 2 years ago
commit d2d5563db1
5 changed files with 99 additions and 76 deletions
  1. clustering.py     +15 -8
  2. hyper_main.py      +25 -14
  3. hyper_testing.py   +13 -13
  4. hyper_tunning.py   +43 -38
  5. learnToLearn.py    +3 -3

clustering.py  +15 -8

 class ClustringModule(torch.nn.Module):
     def __init__(self, config):
         super(ClustringModule, self).__init__()
-        self.h1_dim = 128
-        self.h2_dim = 64
+        # self.h1_dim = 128
+        self.h1_dim = config['cluster_h1_dim']
+        # self.h2_dim = 64
+        self.h2_dim = config['cluster_h2_dim']
         # self.final_dim = fc1_in_dim
-        self.final_dim = 64
-        self.dropout_rate = 0
+        # self.final_dim = 64
+        self.final_dim = config['cluster_final_dim']
+        # self.dropout_rate = 0
+        self.dropout_rate = config['cluster_dropout_rate']

         layers = [nn.Linear(config['embedding_dim'] * 8 + 1, self.h1_dim),
                   torch.nn.Dropout(self.dropout_rate),
                   nn.Linear(self.h2_dim, self.final_dim)]
         self.input_to_hidden = nn.Sequential(*layers)

-        self.clusters_k = 7
+        # self.clusters_k = 7
+        self.clusters_k = config['cluster_k']
         self.embed_size = self.final_dim
         self.array = nn.Parameter(init.xavier_uniform_(torch.FloatTensor(self.clusters_k, self.embed_size)))
-        self.temperature = 1.0
+        # self.temperature = 1.0
+        self.temperature = config['temperature']

     def aggregate(self, z_i):
         return torch.mean(z_i, dim=0)

 ...

         # cluster module
         self.cluster_module = ClustringModule(config)
         # self.task_dim = fc1_in_dim
-        self.task_dim = 64
+        self.task_dim = config['cluster_final_dim']
         # transform task to weights
         self.film_layer_1_beta = nn.Linear(self.task_dim, fc2_in_dim, bias=False)
         self.film_layer_1_gamma = nn.Linear(self.task_dim, fc2_in_dim, bias=False)
         self.film_layer_2_gamma = nn.Linear(self.task_dim, fc2_out_dim, bias=False)
         # self.film_layer_3_beta = nn.Linear(self.task_dim, self.h3_dim, bias=False)
         # self.film_layer_3_gamma = nn.Linear(self.task_dim, self.h3_dim, bias=False)
-        self.dropout_rate = 0
+        # self.dropout_rate = 0
+        self.dropout_rate = config['trainer_dropout_rate']
         self.dropout = nn.Dropout(self.dropout_rate)

     def aggregate(self, z_i):

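For reference, a minimal sketch of how the new config keys drive the cluster network's shape. This is not the repository's module: the class name, the h1-to-h2 linear layer, the ReLU activations, and the example config values are assumptions filled in for illustration (the excerpt above only shows the first and last linear layers).

```python
import torch
import torch.nn as nn
from torch.nn import init


class ClusterSketch(nn.Module):
    """Illustrative stand-in for ClustringModule with config-driven sizes."""

    def __init__(self, config):
        super().__init__()
        h1 = config['cluster_h1_dim']
        h2 = config['cluster_h2_dim']
        final = config['cluster_final_dim']
        self.input_to_hidden = nn.Sequential(
            nn.Linear(config['embedding_dim'] * 8 + 1, h1),
            nn.ReLU(),                                   # activation assumed
            nn.Dropout(config['cluster_dropout_rate']),
            nn.Linear(h1, h2),                           # hidden layer assumed
            nn.ReLU(),
            nn.Linear(h2, final),
        )
        # One learnable centroid per cluster, as in the diff.
        self.array = nn.Parameter(init.xavier_uniform_(
            torch.empty(config['cluster_k'], final)))
        self.temperature = config['temperature']


cfg = {'embedding_dim': 32, 'cluster_h1_dim': 128, 'cluster_h2_dim': 64,
       'cluster_final_dim': 64, 'cluster_dropout_rate': 0.01,
       'cluster_k': 7, 'temperature': 1.0}
print(ClusterSketch(cfg).array.shape)  # torch.Size([7, 64])
```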
hyper_main.py  +25 -14

 def main(num_samples, max_num_epochs=20, gpus_per_trial=2):
-    data_dir = os.path.abspath("/media/external_10TB/10TB/maheri/melu_data5")
+    data_dir = os.path.abspath("/media/external_10TB/10TB/maheri/define_task_melu_data")
     load_data(data_dir)
     config = {
         # "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
         # "lr": tune.loguniform(1e-4, 1e-1),
         # "batch_size": tune.choice([2, 4, 8, 16])
         "transformer": tune.choice(['kronoker']),
-        "meta_algo":tune.choice(['gbml']),
-        "first_order":tune.choice([False]),
-        "adapt_transform":tune.choice([True,False]),
+        "meta_algo": tune.choice(['gbml']),
+        "first_order": tune.choice([False]),
+        "adapt_transform": tune.choice([True, False]),
         # "local_lr":tune.choice([5e-6,5e-4,5e-3]),
         # "lr":tune.choice([5e-5,5e-4]),
-        "local_lr":tune.loguniform(5e-6,5e-3),
-        "lr":tune.loguniform(5e-5,5e-3),
-        "batch_size":tune.choice([16,32,64]),
-        "inner":tune.choice([7,5,4,3,1]),
-        "test_state":tune.choice(["user_and_item_cold_state"]),
-        # "epochs":tune.choice([5,10,20,25]),
+        "local_lr": tune.loguniform(5e-6, 5e-3),
+        "lr": tune.loguniform(5e-5, 5e-3),
+        "batch_size": tune.choice([16, 32, 64]),
+        "inner": tune.choice([7, 5, 4, 3, 1]),
+        "test_state": tune.choice(["user_and_item_cold_state"]),
+
+        "embedding_dim": tune.choice([16, 32, 64]),
+        "first_fc_hidden_dim": tune.choice([32, 64, 128]),
+        "second_fc_hidden_dim": tune.choice([32, 64]),
+
+        'cluster_h1_dim': tune.choice([256, 128, 64]),
+        'cluster_h2_dim': tune.choice([128, 64, 32]),
+        'cluster_final_dim': tune.choice([64, 32]),
+        'cluster_dropout_rate': tune.choice([0, 0.01, 0.1]),
+        'cluster_k': tune.choice([3, 5, 7, 9, 11]),
+        'temperature': tune.choice([0.1, 0.5, 1.0, 2.0, 10.0]),
+        'trainer_dropout_rate': tune.choice([0, 0.01, 0.1]),
     }

     scheduler = ASHAScheduler(
         metric="loss",
         mode="min",
         max_t=30,
-        grace_period=6,
+        grace_period=10,
         reduction_factor=2)
     reporter = CLIReporter(
         # parameter_columns=["l1", "l2", "lr", "batch_size"],
-        metric_columns=["loss", "ndcg1","ndcg3", "training_iteration"])
+        metric_columns=["loss", "ndcg1", "ndcg3", "training_iteration"])
     result = tune.run(
         partial(train_melu, data_dir=data_dir),
         resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
         log_to_file=True,
         # resume=True,
         local_dir="./hyper_tunning_all_cold",
-        name="melu_all_cold",
+        name="melu_all_cold_clustered",

     )


 if __name__ == "__main__":
     # You can change the number of GPUs per trial here:
     main(num_samples=150, max_num_epochs=25, gpus_per_trial=1)

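The excerpt above does not show how the search space, scheduler, and reporter reach `tune.run`. As a reference only, here is a minimal, self-contained sketch of the Ray Tune 1.x-style wiring this file assumes; the dummy trainable, the reduced search space, and all values are illustrative, not the repository's code.

```python
from functools import partial

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler


def train_dummy(config, data_dir=None):
    # Stand-in for train_melu: report a fake loss so the tuning loop runs end to end.
    for epoch in range(5):
        tune.report(loss=1.0 / (epoch + 1), ndcg1=0.0, ndcg3=0.0)


search_space = {
    "lr": tune.loguniform(5e-5, 5e-3),
    "local_lr": tune.loguniform(5e-6, 5e-3),
    "batch_size": tune.choice([16, 32, 64]),
    "cluster_k": tune.choice([3, 5, 7, 9, 11]),
}

scheduler = ASHAScheduler(metric="loss", mode="min", max_t=30,
                          grace_period=10, reduction_factor=2)
reporter = CLIReporter(metric_columns=["loss", "ndcg1", "ndcg3", "training_iteration"])

tune.run(
    partial(train_dummy, data_dir="/tmp/data"),
    config=search_space,              # sampled dict is passed to the trainable
    num_samples=4,
    scheduler=scheduler,              # ASHA stops weak trials early
    progress_reporter=reporter,
    resources_per_trial={"cpu": 1, "gpu": 0},
    local_dir="./hyper_tunning_all_cold",
    name="melu_all_cold_clustered",
)
```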
hyper_testing.py  +13 -13

 from sklearn.metrics import ndcg_score


-def hyper_test(embedding,head, total_dataset, adaptation_step):
+def hyper_test(embedding, head, total_dataset, adaptation_step):
     test_set_size = len(total_dataset)
     random.shuffle(total_dataset)
     a, b, c, d = zip(*total_dataset)
     losses_q = []
     ndcgs11 = []
-    ndcgs33=[]
+    ndcgs33 = []
+
+    head.eval()

     for iterator in range(test_set_size):

         temp_sxs = embedding(supp_xs)
         temp_qxs = embedding(query_xs)

-        evaluation_error,predictions = fast_adapt(learner,
-                                                  temp_sxs,
-                                                  temp_qxs,
-                                                  supp_ys,
-                                                  query_ys,
-                                                  adaptation_step,
-                                                  get_predictions=True)
+        evaluation_error, predictions = fast_adapt(learner,
+                                                   temp_sxs,
+                                                   temp_qxs,
+                                                   supp_ys,
+                                                   query_ys,
+                                                   adaptation_step,
+                                                   get_predictions=True)

         l1 = L1Loss(reduction='mean')
         loss_q = l1(predictions.view(-1), query_ys)
         ndcg1 = 0
         ndcg3 = 0

-    return losses_q,ndcg1,ndcg3
+    head.train()
+    return losses_q, ndcg1, ndcg3

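The new `head.eval()` / `head.train()` calls bracket the evaluation loop so that dropout layers run in inference mode while validation metrics are computed, and are switched back to stochastic behaviour afterwards. A generic PyTorch illustration of the pattern (toy model and data, unrelated to the repository's `fast_adapt` flow):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 16), nn.Dropout(0.1), nn.Linear(16, 1))
x, y = torch.randn(4, 8), torch.randn(4, 1)

model.eval()                         # dropout disabled during evaluation
val_loss = nn.L1Loss()(model(x), y)
model.train()                        # restore training behaviour afterwards
print(val_loss.item())
```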
hyper_tunning.py  +43 -38

 import gc
 from learn2learn.optim.transforms import KroneckerTransform
 from hyper_testing import hyper_test
+from clustering import Trainer


 # Define paths (for data)
-master_path= "/media/external_10TB/10TB/maheri/melu_data5"
+# master_path= "/media/external_10TB/10TB/maheri/melu_data5"


-def load_data(data_dir=master_path,test_state='warm_state'):
+def load_data(data_dir=None, test_state='warm_state'):
     training_set_size = int(len(os.listdir("{}/warm_state".format(data_dir))) / 4)
     supp_xs_s = []
     supp_ys_s = []

 ...

     del (supp_xs_s, supp_ys_s, query_xs_s, query_ys_s)

     random.shuffle(test_dataset)
+    random.shuffle(trainset)
     val_size = int(test_set_size * 0.2)
     validationset = test_dataset[:val_size]
     testset = test_dataset[val_size:]

-    return trainset, validationset,testset
+    return trainset, validationset, testset


 def train_melu(conf, checkpoint_dir=None, data_dir=None):
-    embedding_dim = config['embedding_dim']
-    print("inajm1:",checkpoint_dir)
-    fc1_in_dim = config['embedding_dim'] * 8
-    fc2_in_dim = config['first_fc_hidden_dim']
-    fc2_out_dim = config['second_fc_hidden_dim']
+    print("inajm1:", checkpoint_dir)
+    embedding_dim = conf['embedding_dim']
+    fc1_in_dim = conf['embedding_dim'] * 8
+    fc2_in_dim = conf['first_fc_hidden_dim']
+    fc2_out_dim = conf['second_fc_hidden_dim']

-    fc1 = torch.nn.Linear(fc1_in_dim, fc2_in_dim)
-    fc2 = torch.nn.Linear(fc2_in_dim, fc2_out_dim)
-    linear_out = torch.nn.Linear(fc2_out_dim, 1)
-    head = torch.nn.Sequential(fc1, fc2, linear_out)
+    # fc1 = torch.nn.Linear(fc1_in_dim, fc2_in_dim)
+    # fc2 = torch.nn.Linear(fc2_in_dim, fc2_out_dim)
+    # linear_out = torch.nn.Linear(fc2_out_dim, 1)
+    # head = torch.nn.Sequential(fc1, fc2, linear_out)

     emb = EmbeddingModule(config).cuda()

 ...

     elif conf['transformer'] == "linear":
         transform = l2l.optim.ModuleTransform(torch.nn.Linear)

+    trainer = Trainer(config)

     # define meta algorithm
     if conf['meta_algo'] == "maml":
-        head = l2l.algorithms.MAML(head, lr=conf['local_lr'], first_order=conf['first_order'])
+        trainer = l2l.algorithms.MAML(trainer, lr=conf['local_lr'], first_order=conf['first_order'])
     elif conf['meta_algo'] == 'metasgd':
-        head = l2l.algorithms.MetaSGD(head, lr=conf['local_lr'], first_order=conf['first_order'])
+        trainer = l2l.algorithms.MetaSGD(trainer, lr=conf['local_lr'], first_order=conf['first_order'])
     elif conf['meta_algo'] == 'gbml':
-        head = l2l.algorithms.GBML(head, transform=transform, lr=conf['local_lr'],
+        trainer = l2l.algorithms.GBML(trainer, transform=transform, lr=conf['local_lr'],
                                       adapt_transform=conf['adapt_transform'], first_order=conf['first_order'])
-    head.cuda()
-    net = nn.Sequential(emb,head)
+    trainer.cuda()
+    # net = nn.Sequential(emb, head)

     criterion = nn.MSELoss()
-    all_parameters = list(emb.parameters()) + list(head.parameters())
+    all_parameters = list(emb.parameters()) + list(trainer.parameters())
     optimizer = torch.optim.Adam(all_parameters, lr=conf['lr'])

     if checkpoint_dir:
-        model_state, optimizer_state = torch.load(
-            os.path.join(checkpoint_dir, "checkpoint"))
-        net.load_state_dict(model_state)
-        optimizer.load_state_dict(optimizer_state)
+        print("in checkpoint - bug happened")
+        # model_state, optimizer_state = torch.load(
+        #     os.path.join(checkpoint_dir, "checkpoint"))
+        # net.load_state_dict(model_state)
+        # optimizer.load_state_dict(optimizer_state)

     # loading data
-    train_dataset,validation_dataset,test_dataset = load_data(data_dir,test_state=conf['test_state'])
+    print(conf['test_state'])
+    train_dataset, validation_dataset, test_dataset = load_data(data_dir, test_state=conf['test_state'])
     batch_size = conf['batch_size']
     num_batch = int(len(train_dataset) / batch_size)

     a, b, c, d = zip(*train_dataset)

     for epoch in range(config['num_epoch']):  # loop over the dataset multiple times

 ...

             sys = supp_ys[task].cuda()
             qys = query_ys[task].cuda()

-            learner = head.clone()
+            learner = trainer.clone()
             temp_sxs = emb(sxs)
             temp_qxs = emb(qxs)

             evaluation_error = fast_adapt(learner,
                                           temp_sxs,
                                           temp_qxs,
                                           sys,
                                           qys,
                                           conf['inner'])

             evaluation_error.backward()
             meta_train_error += evaluation_error.item()

-            del(sxs,qxs,sys,qys)
+            del (sxs, qxs, sys, qys)
             supp_xs[task].cpu()
             query_xs[task].cpu()
             supp_ys[task].cpu()
             gc.collect()

         # test results on the validation data
-        val_loss,val_ndcg1,val_ndcg3 = hyper_test(emb,head,validation_dataset,adaptation_step=conf['inner'])
+        val_loss, val_ndcg1, val_ndcg3 = hyper_test(emb, trainer, validation_dataset, adaptation_step=conf['inner'])

-        with tune.checkpoint_dir(epoch) as checkpoint_dir:
-            path = os.path.join(checkpoint_dir, "checkpoint")
-            torch.save((net.state_dict(), optimizer.state_dict()), path)
+        # with tune.checkpoint_dir(epoch) as checkpoint_dir:
+        #     path = os.path.join(checkpoint_dir, "checkpoint")
+        #     torch.save((net.state_dict(), optimizer.state_dict()), path)

-        tune.report(loss=val_loss, ndcg1=val_ndcg1,ndcg3=val_ndcg3)
+        tune.report(loss=val_loss, ndcg1=val_ndcg1, ndcg3=val_ndcg3)
     print("Finished Training")

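Checkpointing is disabled in this commit (the `net` module it used to save no longer exists). If it is re-enabled later, the state of `emb`, the meta-wrapped `trainer`, and the optimizer would need to be saved instead. Below is a self-contained sketch of the Ray Tune 1.x function-API checkpoint pattern, with dummy modules in place of the real ones; it is a reference, not the repository's fix.

```python
import os

import torch
import torch.nn as nn
from ray import tune


def train_fn(config, checkpoint_dir=None):
    # Dummy stand-ins for emb, the meta-wrapped trainer, and the Adam optimizer.
    emb, head = nn.Linear(8, 4), nn.Linear(4, 1)
    optimizer = torch.optim.Adam(list(emb.parameters()) + list(head.parameters()), lr=1e-3)

    # Restore state when Ray resumes the trial from a checkpoint.
    if checkpoint_dir:
        emb_state, head_state, opt_state = torch.load(os.path.join(checkpoint_dir, "checkpoint"))
        emb.load_state_dict(emb_state)
        head.load_state_dict(head_state)
        optimizer.load_state_dict(opt_state)

    for epoch in range(3):
        # ... one epoch of (meta-)training would go here ...
        with tune.checkpoint_dir(step=epoch) as ckpt_dir:
            torch.save((emb.state_dict(), head.state_dict(), optimizer.state_dict()),
                       os.path.join(ckpt_dir, "checkpoint"))
        tune.report(loss=1.0 / (epoch + 1))


tune.run(train_fn, num_samples=1)
```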
learnToLearn.py  +3 -3

print("start of test phase") print("start of test phase")
trainer.eval() trainer.eval()


with open("results.txt", "a") as f:
with open("results2.txt", "a") as f:
f.write("epoch:{}\n".format(iteration)) f.write("epoch:{}\n".format(iteration))


for test_state in ['user_cold_state', 'item_cold_state', 'user_and_item_cold_state']: for test_state in ['user_cold_state', 'item_cold_state', 'user_and_item_cold_state']:


print("===================== " + test_state + " =====================") print("===================== " + test_state + " =====================")
mse, ndc1, ndc3 = test(emb, trainer, test_dataset, batch_size=config['batch_size'],num_epoch=config['num_epoch'],test_state=test_state,args=args) mse, ndc1, ndc3 = test(emb, trainer, test_dataset, batch_size=config['batch_size'],num_epoch=config['num_epoch'],test_state=test_state,args=args)
with open("results.txt", "a") as f:
with open("results2.txt", "a") as f:
f.write("{}\t{}\t{}\n".format(mse, ndc1, ndc3)) f.write("{}\t{}\t{}\n".format(mse, ndc1, ndc3))
print("===================================================") print("===================================================")
del (test_dataset) del (test_dataset)
gc.collect() gc.collect()


trainer.train() trainer.train()
with open("results.txt", "a") as f:
with open("results2.txt", "a") as f:
f.write("\n") f.write("\n")
print("\n\n\n") print("\n\n\n")


