# Imports assumed for this excerpt (the diff does not show the file header):
import torch
import torch.nn as nn
from torch.nn import init


class ClustringModule(torch.nn.Module):
    def __init__(self, config_param):
        super(ClustringModule, self).__init__()
        self.h1_dim = config_param['cluster_h1_dim']
        self.h2_dim = config_param['cluster_h2_dim']
        self.final_dim = config_param['cluster_final_dim']
        self.dropout_rate = config_param['cluster_dropout_rate']
        layers = [nn.Linear(config_param['embedding_dim'] * 8 + 1, self.h1_dim),
                  torch.nn.Dropout(self.dropout_rate),
                  nn.ReLU(inplace=True),
                  # nn.BatchNorm1d(self.h1_dim),
                  # h1_dim -> h2_dim layer added here so the dimensions line up with the final layer
                  nn.Linear(self.h1_dim, self.h2_dim),
                  nn.ReLU(inplace=True),
                  nn.Linear(self.h2_dim, self.final_dim)]
        self.input_to_hidden = nn.Sequential(*layers)
        self.clusters_k = config_param['cluster_k']
        self.embed_size = self.final_dim
        self.array = nn.Parameter(init.xavier_uniform_(torch.FloatTensor(self.clusters_k, self.embed_size)))
        self.temperature = config_param['temperature']

    def aggregate(self, z_i):
        return torch.mean(z_i, dim=0)

    # signature assumed from the usage below; the def line itself is not shown in the excerpt
    def forward(self, task_embed):
        # todo : may be useless
        mean_task = self.aggregate(task_embed)
        # C_distribution, new_task_embed = self.memoryunit(mean_task)
        res = torch.norm(mean_task - self.array, p=2, dim=1, keepdim=True)
        res = torch.pow((res / self.temperature) + 1, (self.temperature + 1) / -2)
        # 1*k soft assignment over the cluster centers (the normalization step is assumed)
        C = torch.transpose(res / res.sum(), 0, 1)
        value = torch.mm(C, self.array)
        # simple add operation
        new_task_embed = value + mean_task
        # calculate target distribution
        return C, new_task_embed
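
# Minimal usage sketch (illustration only, not part of the diff). It assumes the raw
# per-interaction features are first mapped through `input_to_hidden`, so `task_embed`
# has shape (n_interactions, final_dim); `interaction_feats` is a made-up name.
#
#   cluster = ClustringModule(config_param)
#   task_embed = cluster.input_to_hidden(interaction_feats)   # (n, final_dim)
#   C, new_task_embed = cluster(task_embed)                    # C: 1*k soft assignment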


class Trainer(torch.nn.Module):
    def __init__(self, config_param, head=None):
        super(Trainer, self).__init__()
        fc1_in_dim = config_param['embedding_dim'] * 8
        fc2_in_dim = config_param['first_fc_hidden_dim']
        fc2_out_dim = config_param['second_fc_hidden_dim']
        self.fc1 = torch.nn.Linear(fc1_in_dim, fc2_in_dim)
        self.fc2 = torch.nn.Linear(fc2_in_dim, fc2_out_dim)
        self.linear_out = torch.nn.Linear(fc2_out_dim, 1)
        # cluster module
        self.cluster_module = ClustringModule(config_param)
        # self.task_dim = fc1_in_dim
        self.task_dim = config_param['cluster_final_dim']
        # transform task to weights
        self.film_layer_1_beta = nn.Linear(self.task_dim, fc2_in_dim, bias=False)
        self.film_layer_1_gamma = nn.Linear(self.task_dim, fc2_in_dim, bias=False)
        # self.film_layer_3_beta = nn.Linear(self.task_dim, self.h3_dim, bias=False)
        # self.film_layer_3_gamma = nn.Linear(self.task_dim, self.h3_dim, bias=False)
        # self.dropout_rate = 0
        self.dropout_rate = config_param['trainer_dropout_rate']
        self.dropout = nn.Dropout(self.dropout_rate)

    def aggregate(self, z_i):
        return torch.mean(z_i, dim=0)
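
    # Sketch (assumption, not taken from the diff) of how the FiLM layers above are typically
    # applied in the forward pass: the clustered task embedding is mapped to per-feature
    # scale (gamma) and shift (beta) vectors that modulate the first hidden layer.
    #
    #   def forward(self, x, task_embed):
    #       C, new_task_embed = self.cluster_module(task_embed)
    #       gamma_1 = torch.tanh(self.film_layer_1_gamma(new_task_embed))
    #       beta_1 = torch.tanh(self.film_layer_1_beta(new_task_embed))
    #       hidden_1 = gamma_1 * self.fc1(x) + beta_1   # feature-wise affine modulation
    #       hidden_1 = self.dropout(torch.relu(hidden_1))
    #       return self.linear_out(torch.relu(self.fc2(hidden_1)))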


def main(num_samples, max_num_epochs=20, gpus_per_trial=2):
    data_dir = os.path.abspath("/media/external_10TB/10TB/maheri/new_data_dir3")
    load_data(data_dir)
    config = {
        # "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "lr": tune.loguniform(1e-4, 1e-1),
        # "batch_size": tune.choice([2, 4, 8, 16])
        "transformer": tune.choice(['kronoker']),
        "meta_algo": tune.choice(['gbml', 'metasgd']),
        "first_order": tune.choice([False]),
        "adapt_transform": tune.choice([True, False]),
        # "local_lr": tune.choice([5e-6, 5e-4, 5e-3]),
        "local_lr": tune.loguniform(5e-6, 5e-3),
        "lr": tune.loguniform(5e-5, 5e-3),
        "batch_size": tune.choice([16, 32, 64]),
        "inner": tune.choice([1, 3, 5, 7]),
        "test_state": tune.choice(["user_and_item_cold_state"]),
        "embedding_dim": tune.choice([16, 32, 64]),
        'cluster_final_dim': tune.choice([64, 32]),
        'cluster_dropout_rate': tune.choice([0, 0.01, 0.1]),
        'cluster_k': tune.choice([3, 5, 7, 9, 11]),
        'temperature': tune.choice([0.001, 0.1, 0.5, 1.0, 2.0, 10.0]),
        'trainer_dropout_rate': tune.choice([0, 0.01, 0.1]),
        'use_cuda': tune.choice([True]),
        # item
        'num_rate': tune.choice([6]),
        'num_genre': tune.choice([25]),
        'num_director': tune.choice([2186]),
        'num_actor': tune.choice([8030]),
        # user
        'num_gender': tune.choice([2]),
        'num_age': tune.choice([7]),
        'num_occupation': tune.choice([21]),
        'num_zipcode': tune.choice([3402]),
        'num_epoch': tune.choice([30]),
    }
    # The scheduler/reporter arguments are mostly elided in this excerpt; metric, mode and
    # max_t are assumptions here, and metric_columns belongs to the CLIReporter.
    scheduler = ASHAScheduler(metric="loss", mode="min", max_t=max_num_epochs)
    reporter = CLIReporter(
        metric_columns=["loss", "ndcg1", "ndcg3", "training_iteration"])
    result = tune.run(
        partial(train_melu, data_dir=data_dir),
        resources_per_trial={"cpu": 8, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        log_to_file=True,
        # resume=True,
        local_dir="./hyper_tunning_all_cold2",
        name="melu_all_cold_clustered",
    )
    best_trial = result.get_best_trial("loss", "min", "last")
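    # Illustrative follow-up (not in the diff): report what the best trial found.
    # best_trial.config and best_trial.last_result are standard Ray Tune trial attributes.
    print("Best trial config:", best_trial.config)
    print("Best trial final validation loss:", best_trial.last_result["loss"])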
if __name__ == "__main__": | if __name__ == "__main__": | ||||
# You can change the number of GPUs per trial here: | # You can change the number of GPUs per trial here: | ||||
main(num_samples=150, max_num_epochs=25, gpus_per_trial=1) | |||||
main(num_samples=150, max_num_epochs=30, gpus_per_trial=1) |


import torch.nn as nn
from ray import tune
import pickle
# from options import config
from embedding_module import EmbeddingModule
import learn2learn as l2l
import random

random.shuffle(test_dataset)
random.shuffle(trainset)
val_size = int(test_set_size * 0.3)
validationset = test_dataset[:val_size]
testset = test_dataset[val_size:]


def train_melu(conf, checkpoint_dir=None, data_dir=None):
    print("checkpoint_dir:", checkpoint_dir)
    embedding_dim = conf['embedding_dim']
    fc1_in_dim = conf['embedding_dim'] * 8
    fc2_in_dim = conf['first_fc_hidden_dim']
    fc2_out_dim = conf['second_fc_hidden_dim']
    # fc1 = torch.nn.Linear(fc1_in_dim, fc2_in_dim)
    # fc2 = torch.nn.Linear(fc2_in_dim, fc2_out_dim)
    # linear_out = torch.nn.Linear(fc2_out_dim, 1)
    # head = torch.nn.Sequential(fc1, fc2, linear_out)
    emb = EmbeddingModule(conf).cuda()
    transform = None
    if conf['transformer'] == "kronoker":
        # branch body assumed (not shown in the excerpt): a Kronecker-factored transform
        # as the counterpart of the linear ModuleTransform below
        transform = l2l.optim.transforms.KroneckerTransform(l2l.nn.KroneckerLinear)
    elif conf['transformer'] == "linear":
        transform = l2l.optim.ModuleTransform(torch.nn.Linear)
    trainer = Trainer(conf)
    # define meta algorithm; only the GBML construction appears in the excerpt, so the
    # 'maml' and 'metasgd' branch bodies below are assumptions following the standard
    # learn2learn signatures
    if conf['meta_algo'] == "maml":
        trainer = l2l.algorithms.MAML(trainer, lr=conf['local_lr'], first_order=conf['first_order'])
    elif conf['meta_algo'] == "metasgd":
        trainer = l2l.algorithms.MetaSGD(trainer, lr=conf['local_lr'], first_order=conf['first_order'])
    elif conf['meta_algo'] == "gbml":
        trainer = l2l.algorithms.GBML(trainer, transform=transform, lr=conf['local_lr'],
                                      adapt_transform=conf['adapt_transform'], first_order=conf['first_order'])
    trainer.cuda()
    # net = nn.Sequential(emb, head)
    criterion = nn.MSELoss()
    all_parameters = list(emb.parameters()) + list(trainer.parameters())
    optimizer = torch.optim.Adam(all_parameters, lr=conf['lr'])
    a, b, c, d = zip(*train_dataset)

    for epoch in range(conf['num_epoch']):  # loop over the dataset multiple times
        for i in range(num_batch):
            optimizer.zero_grad()
            meta_train_error = 0.0
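            # Sketch (assumption, not part of the excerpt) of the meta-training step that
            # typically follows: clone the learner, adapt it on the support set for
            # conf['inner'] steps, evaluate on the query set, and accumulate the loss.
            # supp_x, supp_y, query_x, query_y are illustrative names only.
            #
            #   learner = trainer.clone()
            #   for _ in range(conf['inner']):
            #       learner.adapt(criterion(learner(supp_x), supp_y))
            #   query_loss = criterion(learner(query_x), query_y)
            #   meta_train_error += query_loss.item()
            #   query_loss.backward()
            # optimizer.step() is then called once per batch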


if config['use_cuda']:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

master_path = "/media/external_10TB/10TB/maheri/define_task_melu_data2"
config['master_path'] = master_path

# DATA GENERATION

        ndcgs3.append(float(ndcg_score([y_true], [y_pred], k=3, sample_weight=None, ignore_ties=False)))
        del supp_xs, supp_ys, query_xs, query_ys, y_true, y_pred, loss_q, temp_sxs, temp_qxs, predictions, l1
        # torch.cuda.empty_cache()

    # calculate metrics
    losses_q = np.array(losses_q).mean()
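    # Sketch (assumption, not shown in the excerpt) of how these metrics would be handed to
    # Ray Tune: the keyword names must match the CLIReporter metric_columns and the "loss"
    # metric used by the scheduler and get_best_trial. ndcgs1 is an illustrative name.
    #
    #   tune.report(loss=float(losses_q), ndcg1=float(np.mean(ndcgs1)), ndcg3=float(np.mean(ndcgs3)))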