
using ray for hyper-parameter tuning

RNN
mohamad maheri, 2 years ago
parent · commit 6029ca93a1
5 changed files with 234 additions and 20 deletions
  1. hyper_main.py     +138  -0
  2. hyper_tunning.py   +73  -0
  3. models.py          +10  -8
  4. trainer.py         +11  -10
  5. utils.py            +2  -2
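Taken together, the new files follow the standard Ray Tune pattern: hyper_tunning.train_metatl is the trainable that calls tune.report with validation metrics, and hyper_main.main drives the search with an ASHAScheduler that maximizes MRR. A minimal, hedged sketch of that pattern (Ray 1.x-style API, matching the calls used in this commit; the trainable below is a stand-in, not the repo's code):

    from ray import tune
    from ray.tune.schedulers import ASHAScheduler

    def toy_trainable(config):
        # stand-in for train_metatl: after each "epoch", report the metric ASHA schedules on
        mrr = 0.0
        for epoch in range(10):
            mrr += config["learning_rate"]   # dummy progress; the real code reports validation MRR
            tune.report(MRR=mrr)

    analysis = tune.run(
        toy_trainable,
        config={"learning_rate": tune.choice([0.01, 0.001])},
        scheduler=ASHAScheduler(metric="MRR", mode="max", grace_period=2),
        num_samples=4,
    )
    print(analysis.get_best_trial("MRR", "max", "last").config)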

hyper_main.py  (+138, -0)

from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from ray import tune
from functools import partial
from hyper_tunning import train_metatl
import argparse
import numpy as np
import torch
import random
from trainer import *
from utils import *
from sampler import *
import copy

def get_params():
    args = argparse.ArgumentParser()
    args.add_argument("-data", "--dataset", default="electronics", type=str)
    args.add_argument("-seed", "--seed", default=None, type=int)
    args.add_argument("-K", "--K", default=3, type=int)  # number of shots

    # args.add_argument("-dim", "--embed_dim", default=100, type=int)
    args.add_argument("-bs", "--batch_size", default=1024, type=int)
    # args.add_argument("-lr", "--learning_rate", default=0.001, type=float)

    args.add_argument("-epo", "--epoch", default=1000, type=int)
    # args.add_argument("-prt_epo", "--print_epoch", default=100, type=int)
    # args.add_argument("-eval_epo", "--eval_epoch", default=1000, type=int)

    # args.add_argument("-b", "--beta", default=5, type=float)
    # args.add_argument("-m", "--margin", default=1, type=float)
    # args.add_argument("-p", "--dropout_p", default=0.5, type=float)

    # args.add_argument("-gpu", "--device", default=1, type=int)

    args = args.parse_args()
    params = {}
    for k, v in vars(args).items():
        params[k] = v

    params['device'] = torch.device('cuda:0')
    return params, args



def main(num_samples, gpus_per_trial=2):

    params, args = get_params()

    if params['seed'] is not None:
        SEED = params['seed']
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True
        np.random.seed(SEED)
        random.seed(SEED)

    user_train, usernum_train, itemnum, user_input_test, user_test, user_input_valid, user_valid = data_load(args.dataset, args.K)

    batch_size = params['batch_size']
    # sampler = WarpSampler(user_train, usernum_train, itemnum, batch_size=batch_size, maxlen=args.K, n_workers=1)
    # sampler_test = DataLoader(user_input_test, user_test, itemnum, params)
    # sampler_valid = DataLoader(user_input_valid, user_valid, itemnum, params)

    config = {
        # "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "lr": tune.loguniform(1e-4, 1e-1),
        # "batch_size": tune.choice([2, 4, 8, 16])
        "embed_dim": tune.choice([50, 75, 100, 125, 150, 200]),
        # "batch_size": tune.choice([128, 256, 512, 1024, 2048]),
        "learning_rate": tune.choice([0.1, 0.01, 0.005, 0.001, 0.0001]),
        "beta": tune.choice([0.1, 1, 5, 10]),
        "margin": tune.choice([1]),

        # "sampler": sampler,
        # "sampler_test": sampler_test,
        # "sampler_valid": sampler_valid,

        "itemnum": itemnum,
        "params": params,
    }

    scheduler = ASHAScheduler(
        metric="MRR",
        mode="max",
        max_t=params['epoch'],
        grace_period=200,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["MRR", "NDCG10", "NDCG5", "NDCG1", "Hits10", "Hits5", "Hits1", "training_iteration"])
    result = tune.run(
        train_metatl,
        resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        log_to_file=True,
        # resume=True,
        local_dir="./ray_local_dir",
        name="metatl_rnn1",
    )

    best_trial = result.get_best_trial("MRR", "max", "last")
    print("Best trial config: {}".format(best_trial.config))
    # train_metatl reports MRR / NDCG* / Hits* (there is no "loss" key), so only print reported metrics
    print("Best trial final validation MRR: {}".format(
        best_trial.last_result["MRR"]))
    print("Best trial final validation NDCG@1: {}".format(
        best_trial.last_result["NDCG1"]))

    print("=======================================================")
    print(result.results_df)
    print("=======================================================\n")

    # best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    # device = "cpu"
    # if torch.cuda.is_available():
    #     device = "cuda:0"
    #     if gpus_per_trial > 1:
    #         best_trained_model = nn.DataParallel(best_trained_model)
    # best_trained_model.to(device)
    #
    # best_checkpoint_dir = best_trial.checkpoint.value
    # model_state, optimizer_state = torch.load(os.path.join(
    #     best_checkpoint_dir, "checkpoint"))
    # best_trained_model.load_state_dict(model_state)
    #
    # test_acc = test_accuracy(best_trained_model, device)
    # print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=150, gpus_per_trial=1)
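The commented-out block near the end of main() still refers to Net and test_accuracy, which appear to be left over from the Ray Tune CIFAR example rather than this repo. A hedged sketch of how the best MetaTL checkpoint written by hyper_tunning.py could be restored instead (it assumes the Ray 1.x checkpoint API used in this commit, the Trainer class imported via `from trainer import *`, and samplers built the same way as in train_metatl; load_best_metatl is a hypothetical helper, not part of the commit):

    import os
    import torch

    def load_best_metatl(best_trial, samplers, itemnum):
        # Hypothetical helper: rebuild the per-trial hyper-parameter dict the same way train_metatl does.
        ps = dict(best_trial.config["params"])
        ps.update({k: best_trial.config[k]
                   for k in ("learning_rate", "beta", "embed_dim", "margin")})

        # samplers = [sampler, sampler_valid, sampler_test], as constructed in train_metatl
        trainer = Trainer(samplers, itemnum, ps)

        # In the Ray 1.x API used here, best_trial.checkpoint.value holds the checkpoint directory;
        # hyper_tunning.py saved a (model_state, optimizer_state) tuple under the name "checkpoint".
        best_checkpoint_dir = best_trial.checkpoint.value
        model_state, optimizer_state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
        trainer.MetaTL.load_state_dict(model_state)
        trainer.optimizer.load_state_dict(optimizer_state)
        return trainer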

hyper_tunning.py  (+73, -0)

import os
import torch
import torch.nn as nn
from ray import tune
import pickle
import random
import gc
from trainer import Trainer
import numpy as np
from utils import *
from sampler import *


def train_metatl(conf, checkpoint_dir=None):

    SEED = conf["params"]['seed']
    if SEED is not None:
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True
        np.random.seed(SEED)
        random.seed(SEED)

    params = conf['params']
    user_train, usernum_train, itemnum, user_input_test, user_test, user_input_valid, user_valid = data_load(params['dataset'], params['K'])
    sampler = WarpSampler(user_train, usernum_train, itemnum, batch_size=params['batch_size'], maxlen=params['K'], n_workers=1)
    sampler_test = DataLoader(user_input_test, user_test, itemnum, params)
    sampler_valid = DataLoader(user_input_valid, user_valid, itemnum, params)

    ps = {
        "batch_size": conf["params"]['batch_size'],
        "learning_rate": conf['learning_rate'],
        "epoch": conf["params"]['epoch'],
        "beta": conf['beta'],
        "embed_dim": conf['embed_dim'],
        "margin": conf['margin'],
        "K": conf["params"]['K'],
    }

    trainer = Trainer([sampler, sampler_valid, sampler_test], conf["itemnum"], ps)

    # trainer.train()

    # restore model/optimizer state when Ray resumes or reuses a trial
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        trainer.MetaTL.load_state_dict(model_state)
        trainer.optimizer.load_state_dict(optimizer_state)

    for epoch in range(ps['epoch']):

        for e in range(100):
            # sample one batch from data_loader
            train_task, curr_rel = trainer.train_data_loader.next_batch()
            loss, _, _ = trainer.do_one_step(train_task, iseval=False, curr_rel=curr_rel)

        # do evaluation on specific epoch
        valid_data = trainer.eval(istest=False, epoch=(-1))

        # print('Epoch {} Testing...'.format(e))
        # test_data = self.eval(istest=True, epoch=e)

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((trainer.MetaTL.state_dict(), trainer.optimizer.state_dict()), path)

        tune.report(
            MRR=valid_data["MRR"], NDCG10=valid_data['NDCG@10'], NDCG5=valid_data["NDCG@5"], NDCG1=valid_data["NDCG@1"],
            Hits10=valid_data["Hits@10"], Hits5=valid_data["Hits@5"], Hits1=valid_data["Hits@1"],
            training_iteration=epoch * 100
        )


models.py  (+10, -8)

  class Embedding(nn.Module):
      def __init__(self, num_ent, parameter):
          super(Embedding, self).__init__()
-         self.device = parameter['device']
+         self.device = torch.device('cuda:0')
          self.es = parameter['embed_dim']
          self.embedding = nn.Embedding(num_ent + 1, self.es)

          super(MetaLearner, self).__init__()
          self.embed_size = embed_size
          self.K = K
-         self.out_size = out_size
-         self.hidden_size = out_size
-         self.rnn = nn.LSTM(embed_size, self.hidden_size, 1)
+         # self.out_size = out_size
+         # self.hidden_size = out_size
+         self.out_size = embed_size
+         self.hidden_size = embed_size
+         self.rnn = nn.LSTM(embed_size, self.hidden_size, 2, dropout=0.2)
          # nn.init.xavier_normal_(self.rnn.all_weights)

      def forward(self, inputs):

  class MetaTL(nn.Module):
      def __init__(self, itemnum, parameter):
          super(MetaTL, self).__init__()
-         self.device = parameter['device']
+         self.device = torch.device('cuda:0')
          self.beta = parameter['beta']
-         self.dropout_p = parameter['dropout_p']
+         # self.dropout_p = parameter['dropout_p']
          self.embed_dim = parameter['embed_dim']
          self.margin = parameter['margin']
          self.embedding = Embedding(itemnum, parameter)

-         self.relation_learner = MetaLearner(parameter['K'] - 1, embed_size=100, num_hidden1=500,
-                                             num_hidden2=200, out_size=100, dropout_p=self.dropout_p)
+         self.relation_learner = MetaLearner(parameter['K'] - 1, embed_size=self.embed_dim, num_hidden1=500,
+                                             num_hidden2=200, out_size=100, dropout_p=0)

          self.embedding_learner = EmbeddingLearner()
          self.loss_func = nn.MarginRankingLoss(self.margin)
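For reference on the nn.LSTM change above: with the hidden size tied to embed_size and num_layers=2 (dropout applied between the two stacked layers), the recurrent output keeps the same feature dimension as the input item embeddings, which is why out_size and hidden_size are now set to embed_size. A small standalone shape check (plain PyTorch, not the repo's MetaLearner; the sizes are illustrative):

    import torch
    import torch.nn as nn

    embed_size, seq_len, batch = 100, 2, 1024              # e.g. K - 1 = 2 transitions when K = 3
    rnn = nn.LSTM(embed_size, embed_size, 2, dropout=0.2)  # input_size, hidden_size, num_layers

    x = torch.randn(seq_len, batch, embed_size)            # default layout is (seq_len, batch, features)
    out, (h_n, c_n) = rnn(x)

    print(out.shape)   # torch.Size([2, 1024, 100]) -- same feature size as the input embeddings
    print(h_n.shape)   # torch.Size([2, 1024, 100]) -- final hidden state for each of the 2 layers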

trainer.py  (+11, -10)

      self.batch_size = parameter['batch_size']
      self.learning_rate = parameter['learning_rate']
      self.epoch = parameter['epoch']
-     self.print_epoch = parameter['print_epoch']
-     self.eval_epoch = parameter['eval_epoch']
-     self.device = parameter['device']
+     # self.print_epoch = parameter['print_epoch']
+     # self.eval_epoch = parameter['eval_epoch']
+     self.eval_epoch = 50
+     self.device = torch.device('cuda:0')

      self.MetaTL = MetaTL(itemnum, parameter)
      self.MetaTL.to(self.device)

          train_task, curr_rel = self.train_data_loader.next_batch()
          loss, _, _ = self.do_one_step(train_task, iseval=False, curr_rel=curr_rel)
          # print the loss on specific epoch
-         if e % self.print_epoch == 0:
-             loss_num = loss.item()
-             print("Epoch: {}\tLoss: {:.4f}".format(e, loss_num))
+         # if e % self.print_epoch == 0:
+         #     loss_num = loss.item()
+         #     print("Epoch: {}\tLoss: {:.4f}".format(e, loss_num))
          # do evaluation on specific epoch
          if e % self.eval_epoch == 0 and e != 0:
              print('Epoch {} Validating...'.format(e))

          # print current temp data dynamically
          for k in data.keys():
              temp[k] = data[k] / t
-         sys.stdout.write("{}\tMRR: {:.3f}\tNDCG@10: {:.3f}\tNDCG@5: {:.3f}\tNDCG@1: {:.3f}\tHits@10: {:.3f}\tHits@5: {:.3f}\tHits@1: {:.3f}\r".format(
-             t, temp['MRR'], temp['NDCG@10'], temp['NDCG@5'], temp['NDCG@1'], temp['Hits@10'], temp['Hits@5'], temp['Hits@1']))
-         sys.stdout.flush()
+         # sys.stdout.write("{}\tMRR: {:.3f}\tNDCG@10: {:.3f}\tNDCG@5: {:.3f}\tNDCG@1: {:.3f}\tHits@10: {:.3f}\tHits@5: {:.3f}\tHits@1: {:.3f}\r".format(
+         #     t, temp['MRR'], temp['NDCG@10'], temp['NDCG@5'], temp['NDCG@1'], temp['Hits@10'], temp['Hits@5'], temp['Hits@1']))
+         # sys.stdout.flush()

      # print overall evaluation result and return it
      for k in data.keys():

      if istest:
          print("TEST: \tMRR: {:.3f}\tNDCG@10: {:.3f}\tNDCG@5: {:.3f}\tNDCG@1: {:.3f}\tHits@10: {:.3f}\tHits@5: {:.3f}\tHits@1: {:.3f}\r".format(
              temp['MRR'], temp['NDCG@10'], temp['NDCG@5'], temp['NDCG@1'], temp['Hits@10'], temp['Hits@5'], temp['Hits@1']))
      else:
          print("VALID: \tMRR: {:.3f}\tNDCG@10: {:.3f}\tNDCG@5: {:.3f}\tNDCG@1: {:.3f}\tHits@10: {:.3f}\tHits@5: {:.3f}\tHits@1: {:.3f}\r".format(

utils.py  (+2, -2)

      user_train = defaultdict(list)

      # assume user/item index starting from 1
-     f = open('data/%s/%s_train.csv' % (fname, fname), 'r')
+     f = open('/home/maheri/metaTL/data/%s/%s_train.csv' % (fname, fname), 'r')
      for line in f:
          u, i, t = line.rstrip().split('\t')
          u = int(u)

      User_test_new = defaultdict(list)
-     f = open('data/%s/%s_test_new_user.csv' % (fname, fname), 'r')
+     f = open('/home/maheri/metaTL/data/%s/%s_test_new_user.csv' % (fname, fname), 'r')
      for line in f:
          u, i, t = line.rstrip().split('\t')
          u = int(u)
