import os
from functools import partial

import numpy as np
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from hyper_tunning import load_data, train_melu


def main(num_samples, max_num_epochs=20, gpus_per_trial=2):
    # NOTE: max_num_epochs is accepted for compatibility but not used below;
    # trial length is bounded by the scheduler's max_t instead.
    data_dir = os.path.abspath("/media/external_10TB/10TB/maheri/melu_data5")
    load_data(data_dir)

    # Search space for the MeLU hyperparameter sweep.
    config = {
        # "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # "lr": tune.loguniform(1e-4, 1e-1),
        # "batch_size": tune.choice([2, 4, 8, 16]),
        "transformer": tune.choice(["kronoker"]),
        "meta_algo": tune.choice(["gbml"]),
        "first_order": tune.choice([False]),
        "adapt_transform": tune.choice([True, False]),
        # "local_lr": tune.choice([5e-6, 5e-4, 5e-3]),
        # "lr": tune.choice([5e-5, 5e-4]),
        "local_lr": tune.loguniform(5e-6, 5e-3),
        "lr": tune.loguniform(5e-5, 5e-3),
        "batch_size": tune.choice([16, 32, 64]),
        "inner": tune.choice([7, 5, 4, 3, 1]),
        "test_state": tune.choice(["user_and_item_cold_state"]),
        # "epochs": tune.choice([5, 10, 20, 25]),
    }

    # ASHA stops poorly performing trials early: each trial runs at least
    # grace_period iterations and at most max_t iterations.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=30,
        grace_period=6,
        reduction_factor=2)

    # Print loss and NDCG metrics on the command line while trials run.
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "ndcg1", "ndcg3", "training_iteration"])

    result = tune.run(
        partial(train_melu, data_dir=data_dir),
        resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        log_to_file=True,
        # resume=True,
        local_dir="./hyper_tunning_all_cold",
        name="melu_all_cold",
    )

    # Summarize the best trial according to its final validation loss.
    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation ndcg1: {}".format(
        best_trial.last_result["ndcg1"]))
    print("Best trial final validation ndcg3: {}".format(
        best_trial.last_result["ndcg3"]))

    # print("=======================================================")
    print(result.results_df)
    print("=======================================================\n")

    # best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    # device = "cpu"
    # if torch.cuda.is_available():
    #     device = "cuda:0"
    #     if gpus_per_trial > 1:
    #         best_trained_model = nn.DataParallel(best_trained_model)
    # best_trained_model.to(device)
    #
    # best_checkpoint_dir = best_trial.checkpoint.value
    # model_state, optimizer_state = torch.load(os.path.join(
    #     best_checkpoint_dir, "checkpoint"))
    # best_trained_model.load_state_dict(model_state)
    #
    # test_acc = test_accuracy(best_trained_model, device)
    # print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=150, max_num_epochs=25, gpus_per_trial=1)
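
# ----------------------------------------------------------------------------
# For reference: tune.run above treats train_melu (imported from hyper_tunning)
# as a function-based Tune trainable. The commented block below is a minimal
# sketch of the interface it is assumed to follow, not the actual
# implementation; train_melu_sketch, the loop bound, and the dummy metric
# values are illustrative placeholders only. What matters is that the reported
# keys match the ones consumed by ASHAScheduler ("loss") and CLIReporter
# ("loss", "ndcg1", "ndcg3").
#
# def train_melu_sketch(config, data_dir=None):
#     for step in range(30):  # illustrative bound; ASHA caps trials at max_t=30
#         # ... train/evaluate the MeLU model for one iteration using
#         # config["lr"], config["local_lr"], config["batch_size"], etc. ...
#         loss = float(np.random.rand())    # placeholder metric
#         ndcg1 = float(np.random.rand())   # placeholder metric
#         ndcg3 = float(np.random.rand())   # placeholder metric
#         # Reporting once per iteration drives training_iteration and lets
#         # the scheduler decide whether to stop this trial early.
#         tune.report(loss=loss, ndcg1=ndcg1, ndcg3=ndcg3)
# ----------------------------------------------------------------------------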